diff --git a/src/operators/batchnorm_op.cpp b/src/operators/batchnorm_op.cpp
index d2fbd9fb6b0192b3728678ae92de7bf8e44e3620..644a27c586375bc66d327e18ac5182e8fce2893b 100644
--- a/src/operators/batchnorm_op.cpp
+++ b/src/operators/batchnorm_op.cpp
@@ -35,7 +35,6 @@ namespace ops = paddle_mobile::operators;
 REGISTER_OPERATOR_CPU(batch_norm, ops::BatchNormOp);
 #endif
 #ifdef PADDLE_MOBILE_MALI_GPU
-USE_OP_MALI_GPU(batch_norm);
 REGISTER_OPERATOR_MALI_GPU(batch_norm, ops::BatchNormOp);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
diff --git a/src/operators/fusion_conv_add.cpp b/src/operators/fusion_conv_add.cpp
index be70370f9de0963bbe6625513257be890e36dacb..656d30c4e1921914d3fe80d930f4219d73f025ea 100644
--- a/src/operators/fusion_conv_add.cpp
+++ b/src/operators/fusion_conv_add.cpp
@@ -76,7 +76,7 @@ namespace ops = paddle_mobile::operators;
 REGISTER_OPERATOR_CPU(fusion_conv_add, ops::FusionConvAddOp);
 #endif
 #ifdef PADDLE_MOBILE_MALI_GPU
-REGISTER_OPERATOR_MALI_GPU(conv_add, ops::FusionConvAddOp);
+REGISTER_OPERATOR_MALI_GPU(fusion_conv_add, ops::FusionConvAddOp);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
 #endif
diff --git a/src/operators/fusion_conv_add.h b/src/operators/fusion_conv_add.h
index 02c9d910b955fd9398df9406a5f730c4a7abbfee..bc623efc8c9288a6007a8141220a2d40b29d7bdb 100644
--- a/src/operators/fusion_conv_add.h
+++ b/src/operators/fusion_conv_add.h
@@ -96,7 +96,7 @@ static framework::FusionOpRegistrar convadd_registrar(
 USE_OP_CPU(fusion_conv_add);
 #endif
 #ifdef PADDLE_MOBILE_MALI_GPU
-USE_OP_MALI_GPU(conv_add);
+USE_OP_MALI_GPU(fusion_conv_add);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
 #endif
diff --git a/src/operators/fusion_fc_op.cpp b/src/operators/fusion_fc_op.cpp
index 1b2a46defc520519e0fb61779cf45059f0a54913..0ca3c26c47e91771ab7019261f78815ad8463408 100644
--- a/src/operators/fusion_fc_op.cpp
+++ b/src/operators/fusion_fc_op.cpp
@@ -80,7 +80,7 @@ namespace ops = paddle_mobile::operators;
 REGISTER_OPERATOR_CPU(fusion_fc, ops::FusionFcOp);
 #endif
 #ifdef PADDLE_MOBILE_MALI_GPU
-REGISTER_OPERATOR_MALI_GPU(fc, ops::FusionFcOp);
+REGISTER_OPERATOR_MALI_GPU(fusion_fc, ops::FusionFcOp);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
 #endif
diff --git a/src/operators/fusion_fc_op.h b/src/operators/fusion_fc_op.h
index c07d59e31e8124325d48a5b9ff208e1e425146a8..b545bb8380aedcc31200080beaec4c56f137ad13 100644
--- a/src/operators/fusion_fc_op.h
+++ b/src/operators/fusion_fc_op.h
@@ -78,7 +78,7 @@ extern framework::FusionOpRegistrar fc_registrar(new FusionFcMatcher());
 
 #ifndef CONV_CPU_REGISTER
 #define CONV_CPU_REGISTER
-static framework::FusionOpRegistrar fc_registrar(new FusionFcMatcher());
+extern framework::FusionOpRegistrar fc_registrar(new FusionFcMatcher());
 #endif
 
 #endif
@@ -93,7 +93,7 @@ static framework::FusionOpRegistrar fc_registrar(new FusionFcMatcher());
 USE_OP_CPU(fusion_fc);
 #endif
 #ifdef PADDLE_MOBILE_MALI_GPU
-USE_OP_MALI_GPU(fc);
+USE_OP_MALI_GPU(fusion_fc);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
 #endif
diff --git a/src/operators/kernel/mali/acl_operator.cc b/src/operators/kernel/mali/acl_operator.cc
old mode 100644
new mode 100755
diff --git a/src/operators/kernel/mali/acl_operator.h b/src/operators/kernel/mali/acl_operator.h
old mode 100644
new mode 100755
index c2e13283b1c679d6dfc8972af5ace5e579d568e6..bf8200d486f91998c79540177ab1b26596a3e9dc
--- a/src/operators/kernel/mali/acl_operator.h
+++ b/src/operators/kernel/mali/acl_operator.h
@@ -225,6 +225,7 @@ class AclParameters {
   bool is_global_pool;
   bool is_channel_concat;
+  bool is_bypass;
 
   std::vector<void*> in_tensor;
 };
 
diff --git a/src/operators/kernel/mali/acl_tensor.cc b/src/operators/kernel/mali/acl_tensor.cc
old mode 100644
new mode 100755
diff --git a/src/operators/kernel/mali/acl_tensor.h b/src/operators/kernel/mali/acl_tensor.h
old mode 100644
new mode 100755
diff --git a/src/operators/kernel/mali/batchnorm_kernel.cpp b/src/operators/kernel/mali/batchnorm_kernel.cpp
old mode 100644
new mode 100755
index 22ce472c464bc9ed89ee721244e9873c01601ebd..ad648d615cd8f9134b212d484d7174c95e027551
--- a/src/operators/kernel/mali/batchnorm_kernel.cpp
+++ b/src/operators/kernel/mali/batchnorm_kernel.cpp
@@ -71,6 +71,7 @@ class AclBatchNormOp : public acl::ACLOperator {
   bool Bypass_acl(const BatchNormParam& param) {
     bool bypass_acl = false;
     AclParametersByContext(param);
+    InitAclLayer(param);
     // for performance, more groups impact GPU performance
     if (this->force_bypass_acl_path_) {
       bypass_acl = true;
@@ -135,6 +136,10 @@ bool BatchNormKernel<GPU_MALI, float>::Init(BatchNormParam* param) {
     acl_op = new AclBatchNormOp<GPU_MALI, float>();
     this->SetAclOp((void*)acl_op, (void*)this);
   }
+  if (acl_op->Bypass_acl(*param)) {
+    std::cout << "init acl failed" << std::endl;
+    return false;
+  }
   return true;
 }
 
@@ -147,15 +152,8 @@ void BatchNormKernel<GPU_MALI, float>::Compute(
   if (acl_op == nullptr) {
     return;
   }
-  if (acl_op->Bypass_acl(param)) {
-    std::cout << "init acl failed" << std::endl;
-    return;
-  }
   acl::AclParameters& args = acl_op->getargs();
-  const float* input_data = (const float*)args.input_data;
-  const float* output_data = (const float*)args.output_data;
-  acl_op->InitAclLayer(param);
-  acl_op->RunAcl((void*)input_data, (void*)output_data);
+  acl_op->RunAcl(args.input_data, args.output_data);
 }
 
 template class BatchNormKernel<GPU_MALI, float>;
diff --git a/src/operators/kernel/mali/concat_kernel.cpp b/src/operators/kernel/mali/concat_kernel.cpp
old mode 100644
new mode 100755
index 08ee58d41577dfb5fd3a99755d66b5677b7b7ed2..aaa586b6d977bfca96e596261ec090637cf87207
--- a/src/operators/kernel/mali/concat_kernel.cpp
+++ b/src/operators/kernel/mali/concat_kernel.cpp
@@ -50,8 +50,6 @@
     T type;
 
     for (int i = 0; i < input_data->size(); i++) {
-      const T* idata = (*input_data)[i]->data<T>();
-      const T* pdata = (*input_data)[i]->data<T>();
       int in_batch = (*input_data)[i]->dims()[0];
       int in_channels = (*input_data)[i]->dims()[1];
       int in_width = (*input_data)[i]->dims()[2];
@@ -75,6 +73,7 @@
   bool Bypass_acl(const ConcatParam& param) {
     bool bypass_acl = false;
     AclParametersByContext(param);
+    InitAclLayer(param);
     // for performance, more groups impact GPU performance
     if (this->force_bypass_acl_path_ || !args.is_channel_concat) {
       bypass_acl = true;
@@ -103,13 +102,17 @@
 };
 
 template <>
-bool ConcatKernel<GPU_MALI, float>::Init(const ConcatParam& param) const {
+bool ConcatKernel<GPU_MALI, float>::Init(ConcatParam* param) {
   AclConcatOp<GPU_MALI, float>* acl_op =
       reinterpret_cast<AclConcatOp<GPU_MALI, float>*>(this->GetAclOp());
   if (acl_op == nullptr) {
     acl_op = new AclConcatOp<GPU_MALI, float>();
     this->SetAclOp((void*)acl_op, (void*)this);
   }
+  if (acl_op->Bypass_acl(*param)) {
+    std::cout << "init acl failed" << std::endl;
+    return false;
+  }
   return true;
 }
 
@@ -121,15 +124,8 @@ void ConcatKernel<GPU_MALI, float>::Compute(const ConcatParam& param) const {
   if (acl_op == nullptr) {
     return;
   }
-  if (acl_op->Bypass_acl(param)) {
-    std::cout << "init acl failed" << std::endl;
-    return;
-  }
   acl::AclParameters& args = acl_op->getargs();
-  std::vector<void*> temp_data = args.in_tensor;
-  const float* output_data = (const float*)args.output_data;
-  acl_op->InitAclLayer(param);
-  acl_op->RunAcl(temp_data, (void*)output_data);
+  acl_op->RunAcl(args.in_tensor, args.output_data);
 }
 
 template class ConcatKernel<GPU_MALI, float>;
diff --git a/src/operators/kernel/mali/conv_add_kernel.cpp b/src/operators/kernel/mali/conv_add_kernel.cpp
index 1d34910231c086673c58d8dba2c1e44992b5d593..318db016d12f36981c07627139bcc49d07162d52 100644
--- a/src/operators/kernel/mali/conv_add_kernel.cpp
+++ b/src/operators/kernel/mali/conv_add_kernel.cpp
@@ -55,7 +55,8 @@ class AclConvAddOp : public acl::ACLOperator {
     set_operator_init_done();
     this->force_bypass_acl_path_ = false;
 
-    check_direct_conv();
+    // check_direct_conv();
+    group() = args.num_group;
     //[kernel_x, kernel_y, IFM, OFM]
     new_tensor(weights(), weights_shape, args.weight_data);
     //[OFM]
@@ -63,8 +64,6 @@
       new_tensor(biases(), biases_shape, args.biases_data);
     }
 
-    group() = args.num_group;
-
     //[width, height, IFM]
     new_tensor(input(), input_shape, args.input_data);
     //[width, height, OFM]
@@ -79,6 +78,7 @@
   bool Bypass_acl(const FusionConvAddParam& param) {
     bool bypass_acl = false;
     AclParametersByContext(param);
+    InitAclLayer(param);
     // for performance, more groups impact GPU performance
     if (this->force_bypass_acl_path_ || args.num_group >= 5) {
       bypass_acl = true;
@@ -196,14 +196,17 @@
 };
 
 template <>
-bool ConvAddKernel<GPU_MALI, float>::Init(
-    const FusionConvAddParam& param) const {
+bool ConvAddKernel<GPU_MALI, float>::Init(FusionConvAddParam* param) {
   AclConvAddOp<GPU_MALI, float>* acl_op =
       reinterpret_cast<AclConvAddOp<GPU_MALI, float>*>(this->GetAclOp());
   if (acl_op == nullptr) {
     acl_op = new AclConvAddOp<GPU_MALI, float>();
     this->SetAclOp((void*)acl_op, (void*)this);
   }
+  if (acl_op->Bypass_acl(*param)) {
+    std::cout << "init acl failed" << std::endl;
+    return false;
+  }
   return true;
 }
 
@@ -216,15 +219,9 @@ void ConvAddKernel<GPU_MALI, float>::Compute(
   if (acl_op == nullptr) {
     return;
   }
-  if (acl_op->Bypass_acl(param)) {
-    std::cout << "init acl failed" << std::endl;
-    return;
-  }
   acl::AclParameters& args = acl_op->getargs();
-  const float* input_data = (const float*)args.input_data;
-  const float* output_data = (const float*)args.output_data;
-  acl_op->InitAclLayer(param);
-  acl_op->RunAcl((void*)input_data, (void*)output_data);
+
+  acl_op->RunAcl(args.input_data, args.output_data);
 }
 
 template class ConvAddKernel<GPU_MALI, float>;
diff --git a/src/operators/kernel/mali/conv_kernel.cpp b/src/operators/kernel/mali/conv_kernel.cpp
old mode 100644
new mode 100755
index 36f438605317dd016d2f44cf9c5efc0ab33c5923..c548977ebaa34fabc1b1fe54d6db9690bcb424f1
--- a/src/operators/kernel/mali/conv_kernel.cpp
+++ b/src/operators/kernel/mali/conv_kernel.cpp
@@ -79,6 +79,7 @@
   bool Bypass_acl(const ConvParam& param) {
     bool bypass_acl = false;
     AclParametersByContext(param);
+    InitAclLayer(param);
     // for performance, more groups impact GPU performance
     if (this->force_bypass_acl_path_ || args.num_group >= 5) {
       bypass_acl = true;
@@ -202,6 +203,10 @@ bool ConvKernel<GPU_MALI, float>::Init(ConvParam* param) {
     acl_op = new AclConvOp<GPU_MALI, float>();
     this->SetAclOp((void*)acl_op, (void*)this);
   }
+  if (acl_op->Bypass_acl(*param)) {
+    std::cout << "init acl failed" << std::endl;
+    return false;
+  }
   return true;
 }
 
@@ -213,15 +218,8 @@ void ConvKernel<GPU_MALI, float>::Compute(const ConvParam& param) const {
   if (acl_op == nullptr) {
     return;
   }
-  if (acl_op->Bypass_acl(param)) {
-    std::cout << "init acl failed" << std::endl;
-    return;
-  }
   acl::AclParameters& args = acl_op->getargs();
-  const float* input_data = (const float*)args.input_data;
-  const float* output_data = (const float*)args.output_data;
-  acl_op->InitAclLayer(param);
-  acl_op->RunAcl((void*)input_data, (void*)output_data);
+  acl_op->RunAcl(args.input_data, args.output_data);
 }
 
 template class ConvKernel<GPU_MALI, float>;
diff --git a/src/operators/kernel/mali/elementwise_add_kernel.cpp b/src/operators/kernel/mali/elementwise_add_kernel.cpp
old mode 100644
new mode 100755
diff --git a/src/operators/kernel/mali/fushion_fc_kernel.cpp b/src/operators/kernel/mali/fushion_fc_kernel.cpp
old mode 100644
new mode 100755
diff --git a/src/operators/kernel/mali/lrn_kernel.cpp b/src/operators/kernel/mali/lrn_kernel.cpp
index c063ec8783382ccef79086368df8a97320010c23..4fb5fca8695dccc45c7169d8572618965b3d84a3 100644
--- a/src/operators/kernel/mali/lrn_kernel.cpp
+++ b/src/operators/kernel/mali/lrn_kernel.cpp
@@ -20,6 +20,7 @@ limitations under the License. */
 #ifdef PADDLE_MOBILE_MALI_GPU
 #include "acl_operator.h"
 #include "framework/operator.h"
+#include "operators/kernel/central-arm-func/lrn_arm_func.h"
 #include "operators/op_param.h"
 
 namespace paddle_mobile {
@@ -59,12 +60,15 @@ class AclLrnOp : public acl::ACLOperator {
     acl_configure(lrn, this, norm_info);
   }
 
+  void Set_bypass(bool bypass) { args.is_bypass = bypass; }
+
   void RunAcl(void* input, void* output) {
     acl::ACLOperator::acl_run(input, output);
   }
   bool Bypass_acl(const LrnParam& param) {
     bool bypass_acl = false;
     AclParametersByContext(param);
+    InitAclLayer(param);
     // for performance, more groups impact GPU performance
     if (this->force_bypass_acl_path_) {
       bypass_acl = true;
@@ -107,13 +111,18 @@
 };
 
 template <>
-bool LrnKernel<GPU_MALI, float>::Init(const LrnParam& param) const {
+bool LrnKernel<GPU_MALI, float>::Init(LrnParam* param) {
   AclLrnOp<GPU_MALI, float>* acl_op =
       reinterpret_cast<AclLrnOp<GPU_MALI, float>*>(this->GetAclOp());
   if (acl_op == nullptr) {
     acl_op = new AclLrnOp<GPU_MALI, float>();
     this->SetAclOp((void*)acl_op, (void*)this);
   }
+  if (acl_op->Bypass_acl(*param)) {
+    acl_op->Set_bypass(true);
+    std::cout << "init acl failed" << std::endl;
+    return true;
+  }
   return true;
 }
 
@@ -125,14 +134,14 @@ void LrnKernel<GPU_MALI, float>::Compute(const LrnParam& param) const {
   if (acl_op == nullptr) {
     return;
   }
-  if (acl_op->Bypass_acl(param)) {
-    std::cout << "init acl failed" << std::endl;
+  acl::AclParameters& args = acl_op->getargs();
+  if (args.is_bypass) {
+    std::cout << "bypass op" << std::endl;
+    LrnCompute<float>(param);
     return;
   }
-  acl::AclParameters& args = acl_op->getargs();
   const float* input_data = (const float*)args.input_data;
   const float* output_data = (const float*)args.output_data;
-  acl_op->InitAclLayer(param);
   for (int n = 0; n < args.batch; ++n) {
     acl_op->RunAcl((void*)input_data, (void*)output_data);
     input_data += args.in_depth * args.in_cols * args.in_rows;
diff --git a/src/operators/kernel/mali/mul_kernel.cpp b/src/operators/kernel/mali/mul_kernel.cpp
old mode 100644
new mode 100755
diff --git a/src/operators/kernel/mali/pool_kernel.cpp b/src/operators/kernel/mali/pool_kernel.cpp
old mode 100644
new mode 100755
index 9de90deebca05ef50cf94fa958f37bbcf1a08c4b..1f49391341d69a0690352c69c9c208550f8e1c24
--- a/src/operators/kernel/mali/pool_kernel.cpp
+++ b/src/operators/kernel/mali/pool_kernel.cpp
@@ -82,6 +82,7 @@
   bool Bypass_acl(const PoolParam& param) {
     bool bypass_acl = false;
     AclParametersByContext(param);
+    InitAclLayer(param);
     // for performance, more groups impact GPU performance
     if (this->force_bypass_acl_path_) {
       bypass_acl = true;
@@ -179,13 +180,17 @@
 };
 
 template <>
-bool PoolKernel<GPU_MALI, float>::Init(const PoolParam& param) const {
+bool PoolKernel<GPU_MALI, float>::Init(PoolParam* param) {
   AclPoolOp<GPU_MALI, float>* acl_op =
       reinterpret_cast<AclPoolOp<GPU_MALI, float>*>(this->GetAclOp());
   if (acl_op == nullptr) {
     acl_op = new AclPoolOp<GPU_MALI, float>();
     this->SetAclOp((void*)acl_op, (void*)this);
   }
+  if (acl_op->Bypass_acl(*param)) {
+    std::cout << "init acl failed" << std::endl;
+    return false;
+  }
   return true;
 }
 
@@ -197,14 +202,9 @@ void PoolKernel<GPU_MALI, float>::Compute(const PoolParam& param) const {
   if (acl_op == nullptr) {
     return;
   }
-  if (acl_op->Bypass_acl(param)) {
-    std::cout << "init acl failed" << std::endl;
-    return;
-  }
   acl::AclParameters& args = acl_op->getargs();
   const float* input_data = (const float*)args.input_data;
   const float* output_data = (const float*)args.output_data;
-  acl_op->InitAclLayer(param);
   for (int n = 0; n < args.batch; ++n) {
     acl_op->RunAcl((void*)input_data, (void*)output_data);
     input_data += args.in_depth * args.in_cols * args.in_rows;
diff --git a/src/operators/kernel/mali/relu_kernel.cpp b/src/operators/kernel/mali/relu_kernel.cpp
old mode 100644
new mode 100755
index 3deebc9d2f1a9f652813362f4947f744f0541482..1a8c0f88543199e7a863cc44b5b0a6be3bc6212d
--- a/src/operators/kernel/mali/relu_kernel.cpp
+++ b/src/operators/kernel/mali/relu_kernel.cpp
@@ -41,10 +41,10 @@
   acl::AclParameters& getargs() { return args; }
   void InitAclLayer(const ReluParam& param) {
     setTargetHint(acl::TargetHint::OPENCL);
-    arm_compute::TensorShape input_shape(args.in_cols * args.in_rows *
-                                         args.in_depth * args.batch);
-    arm_compute::TensorShape output_shape(args.in_cols * args.in_rows *
-                                          args.in_depth * args.out_num);
+    arm_compute::TensorShape input_shape(args.in_cols, args.in_rows,
+                                         args.in_depth, args.batch);
+    arm_compute::TensorShape output_shape(args.in_cols, args.in_rows,
+                                          args.in_depth, args.out_num);
     // arm_compute::TensorShape weights_shape(
     //     args.filter_cols, args.filter_rows, args.in_depth, args.out_depth);
     // arm_compute::TensorShape biases_shape(args.out_depth);
@@ -71,6 +71,7 @@
   bool Bypass_acl(const ReluParam& param) {
     bool bypass_acl = false;
     AclParametersByContext(param);
+    InitAclLayer(param);
     // for performance, more groups impact GPU performance
     if (this->force_bypass_acl_path_) {
       bypass_acl = true;
@@ -99,13 +100,17 @@
 };
 
 template <>
-bool ReluKernel<GPU_MALI, float>::Init(const ReluParam& param) const {
+bool ReluKernel<GPU_MALI, float>::Init(ReluParam* param) {
   AclReluOp<GPU_MALI, float>* acl_op =
       reinterpret_cast<AclReluOp<GPU_MALI, float>*>(this->GetAclOp());
   if (acl_op == nullptr) {
     acl_op = new AclReluOp<GPU_MALI, float>();
     this->SetAclOp((void*)acl_op, (void*)this);
   }
+  if (acl_op->Bypass_acl(*param)) {
+    std::cout << "init acl failed" << std::endl;
+    return false;
+  }
   return true;
 }
 
@@ -117,15 +122,8 @@ void ReluKernel<GPU_MALI, float>::Compute(const ReluParam& param) const {
   if (acl_op == nullptr) {
     return;
   }
-  if (acl_op->Bypass_acl(param)) {
-    std::cout << "init acl failed" << std::endl;
-    return;
-  }
   acl::AclParameters& args = acl_op->getargs();
-  const float* input_data = (const float*)args.input_data;
-  const float* output_data = (const float*)args.output_data;
-  acl_op->InitAclLayer(param);
-  acl_op->RunAcl((void*)input_data, (void*)output_data);
+  acl_op->RunAcl(args.input_data, args.output_data);
 }
 
 template class ReluKernel<GPU_MALI, float>;
diff --git a/src/operators/kernel/mali/reshape_kernel.cpp b/src/operators/kernel/mali/reshape_kernel.cpp
old mode 100644
new mode 100755
diff --git a/src/operators/kernel/mali/softmax_kernel.cpp b/src/operators/kernel/mali/softmax_kernel.cpp
old mode 100644
new mode 100755
index 36edb3724600ada43606c23b1989615183ff21e8..37d2f2b6b1656602e5acfecd3ac79733f570844d
--- a/src/operators/kernel/mali/softmax_kernel.cpp
+++ b/src/operators/kernel/mali/softmax_kernel.cpp
@@ -61,6 +61,7 @@
   bool Bypass_acl(const SoftmaxParam& param) {
     bool bypass_acl = false;
     AclParametersByContext(param);
+    InitAclLayer(param);
     // for performance, more groups impact GPU performance
     if (this->force_bypass_acl_path_) {
       bypass_acl = true;
@@ -96,13 +97,17 @@
 };
 
 template <>
-bool SoftmaxKernel<GPU_MALI, float>::Init(const SoftmaxParam& param) const {
+bool SoftmaxKernel<GPU_MALI, float>::Init(SoftmaxParam* param) {
   AclSoftmaxOp<GPU_MALI, float>* acl_op =
       reinterpret_cast<AclSoftmaxOp<GPU_MALI, float>*>(this->GetAclOp());
   if (acl_op == nullptr) {
     acl_op = new AclSoftmaxOp<GPU_MALI, float>();
     this->SetAclOp((void*)acl_op, (void*)this);
   }
+  if (acl_op->Bypass_acl(*param)) {
+    std::cout << "init acl failed" << std::endl;
+    return false;
+  }
   return true;
 }
 
@@ -114,14 +119,10 @@ void SoftmaxKernel<GPU_MALI, float>::Compute(const SoftmaxParam& param) const {
   if (acl_op == nullptr) {
     return;
   }
-  if (acl_op->Bypass_acl(param)) {
-    std::cout << "init acl failed" << std::endl;
-    return;
-  }
   acl::AclParameters& args = acl_op->getargs();
   const float* input_data = (const float*)args.input_data;
   const float* output_data = (const float*)args.output_data;
-  acl_op->InitAclLayer(param);
+
   for (int n = 0; n < args.out_num; ++n) {
     acl_op->RunAcl((void*)input_data, (void*)output_data);
     input_data += args.in_depth;