提交 e73e9a9a 编写于 作者: M mindspore-ci-bot 提交者: Gitee

!4261 [MS][LITE] fix arm fp32 op bug: conv_depthwise_3x3, batchnorm, scale, etc.

Merge pull request !4261 from yangruoqi713/test_dw
...@@ -40,7 +40,6 @@ ...@@ -40,7 +40,6 @@
#include "src/runtime/kernel/arm/nnacl/fp32/reduce.h" #include "src/runtime/kernel/arm/nnacl/fp32/reduce.h"
#include "src/runtime/kernel/arm/nnacl/fp32/activation.h" #include "src/runtime/kernel/arm/nnacl/fp32/activation.h"
#include "src/runtime/kernel/arm/nnacl/fp32/arithmetic.h" #include "src/runtime/kernel/arm/nnacl/fp32/arithmetic.h"
#include "src/runtime/kernel/arm/nnacl/fused_batchnorm.h"
#include "src/runtime/kernel/arm/nnacl/fp32/batchnorm.h" #include "src/runtime/kernel/arm/nnacl/fp32/batchnorm.h"
#include "src/runtime/kernel/arm/nnacl/power.h" #include "src/runtime/kernel/arm/nnacl/power.h"
#include "src/runtime/kernel/arm/nnacl/fp32/range.h" #include "src/runtime/kernel/arm/nnacl/fp32/range.h"
...@@ -510,15 +509,15 @@ OpParameter *PopulateActivationParameter(const lite::Primitive *primitive) { ...@@ -510,15 +509,15 @@ OpParameter *PopulateActivationParameter(const lite::Primitive *primitive) {
} }
OpParameter *PopulateFusedBatchNorm(const lite::Primitive *primitive) { OpParameter *PopulateFusedBatchNorm(const lite::Primitive *primitive) {
FusedBatchNormParameter *fuse_batch_norm_param = new (std::nothrow) FusedBatchNormParameter(); BatchNormParameter *batch_norm_param = new (std::nothrow) BatchNormParameter();
if (fuse_batch_norm_param == nullptr) { if (batch_norm_param == nullptr) {
MS_LOG(ERROR) << "new FusedBatchNormParameter failed."; MS_LOG(ERROR) << "new FusedBatchNormParameter failed.";
return nullptr; return nullptr;
} }
fuse_batch_norm_param->op_parameter_.type_ = primitive->Type(); batch_norm_param->op_parameter_.type_ = primitive->Type();
auto param = primitive->Value()->value_as_FusedBatchNorm(); auto param = primitive->Value()->value_as_FusedBatchNorm();
fuse_batch_norm_param->epsilon_ = param->epsilon(); batch_norm_param->epsilon_ = param->epsilon();
return reinterpret_cast<OpParameter *>(fuse_batch_norm_param); return reinterpret_cast<OpParameter *>(batch_norm_param);
} }
OpParameter *PopulateArithmetic(const lite::Primitive *primitive) { OpParameter *PopulateArithmetic(const lite::Primitive *primitive) {
......
...@@ -28,6 +28,22 @@ using mindspore::lite::RET_OK; ...@@ -28,6 +28,22 @@ using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_DepthwiseConv2D; using mindspore::schema::PrimitiveType_DepthwiseConv2D;
namespace mindspore::kernel { namespace mindspore::kernel {
// Destructor: releases the sliding-window descriptor and the packed
// weight/input/output buffers held by this kernel.
ConvolutionDepthwiseFp16CPUKernel::~ConvolutionDepthwiseFp16CPUKernel() {
  delete sliding_;
  // The packed buffers are raw C allocations (InitBuffer describes them as
  // malloc'd), so they must be returned with free(); releasing malloc'd memory
  // through `delete` is undefined behaviour.
  // NOTE(review): packed_weight_ is assumed to be malloc'd like the other
  // packed buffers -- confirm against InitWeightBias.
  if (packed_weight_ != nullptr) {
    free(packed_weight_);
    packed_weight_ = nullptr;
  }
  if (packed_input_ != nullptr) {
    free(packed_input_);
    packed_input_ = nullptr;
  }
  if (packed_output_ != nullptr) {
    free(packed_output_);
    packed_output_ = nullptr;
  }
}
int ConvolutionDepthwiseFp16CPUKernel::InitBuffer() { int ConvolutionDepthwiseFp16CPUKernel::InitBuffer() {
// malloc pack input buffer // malloc pack input buffer
int C8 = UP_DIV(conv_param_->input_channel_, C8NUM); int C8 = UP_DIV(conv_param_->input_channel_, C8NUM);
...@@ -113,8 +129,14 @@ int ConvolutionDepthwiseFp16CPUKernel::Init() { ...@@ -113,8 +129,14 @@ int ConvolutionDepthwiseFp16CPUKernel::Init() {
} }
int ConvolutionDepthwiseFp16CPUKernel::ReSize() { int ConvolutionDepthwiseFp16CPUKernel::ReSize() {
free(packed_input_); if (packed_input_ != nullptr) {
free(packed_output_); delete packed_input_;
packed_input_ = nullptr;
}
if (packed_output_ != nullptr) {
delete packed_output_;
packed_output_ = nullptr;
}
ConvolutionBaseCPUKernel::Init(); ConvolutionBaseCPUKernel::Init();
InitSlidingParam(sliding_, conv_param_, C8NUM); InitSlidingParam(sliding_, conv_param_, C8NUM);
......
...@@ -29,12 +29,7 @@ class ConvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseCPUKernel { ...@@ -29,12 +29,7 @@ class ConvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseCPUKernel {
const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx, const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx,
const lite::Primitive *primitive) const lite::Primitive *primitive)
: ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {} : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
~ConvolutionDepthwiseFp16CPUKernel() override { ~ConvolutionDepthwiseFp16CPUKernel() override;
delete sliding_;
free(packed_weight_);
free(packed_input_);
free(packed_output_);
}
int Init() override; int Init() override;
int ReSize() override; int ReSize() override;
......
...@@ -28,6 +28,22 @@ using mindspore::lite::RET_OK; ...@@ -28,6 +28,22 @@ using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_DeDepthwiseConv2D; using mindspore::schema::PrimitiveType_DeDepthwiseConv2D;
namespace mindspore::kernel { namespace mindspore::kernel {
// Destructor: releases the sliding-window descriptor and the packed
// weight/input/output buffers held by this kernel.
DeconvolutionDepthwiseFp16CPUKernel::~DeconvolutionDepthwiseFp16CPUKernel() {
  delete sliding_;
  // NOTE(review): the packed buffers are assumed to be malloc()-allocated
  // (confirm against InitBuffer/InitWeightBias); such memory must be returned
  // with free(), since `delete` on malloc'd storage is undefined behaviour.
  if (packed_weight_ != nullptr) {
    free(packed_weight_);
    packed_weight_ = nullptr;
  }
  if (packed_input_ != nullptr) {
    free(packed_input_);
    packed_input_ = nullptr;
  }
  if (packed_output_ != nullptr) {
    free(packed_output_);
    packed_output_ = nullptr;
  }
}
int DeconvolutionDepthwiseFp16CPUKernel::InitSlideParam() { int DeconvolutionDepthwiseFp16CPUKernel::InitSlideParam() {
conv_param_->input_batch_ = outputs_.front()->shape().at(kNHWC_N); conv_param_->input_batch_ = outputs_.front()->shape().at(kNHWC_N);
conv_param_->input_h_ = outputs_.front()->shape().at(kNHWC_H); conv_param_->input_h_ = outputs_.front()->shape().at(kNHWC_H);
...@@ -126,8 +142,14 @@ int DeconvolutionDepthwiseFp16CPUKernel::Init() { ...@@ -126,8 +142,14 @@ int DeconvolutionDepthwiseFp16CPUKernel::Init() {
} }
int DeconvolutionDepthwiseFp16CPUKernel::ReSize() { int DeconvolutionDepthwiseFp16CPUKernel::ReSize() {
free(packed_input_); if (packed_input_ != nullptr) {
free(packed_output_); delete packed_input_;
packed_input_ = nullptr;
}
if (packed_output_ != nullptr) {
delete packed_output_;
packed_output_ = nullptr;
}
InitSlideParam(); InitSlideParam();
ConvolutionBaseCPUKernel::Init(); ConvolutionBaseCPUKernel::Init();
......
...@@ -29,14 +29,7 @@ class DeconvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseCPUKernel { ...@@ -29,14 +29,7 @@ class DeconvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseCPUKernel {
const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx, const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx,
const lite::Primitive *primitive) const lite::Primitive *primitive)
: ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {} : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
~DeconvolutionDepthwiseFp16CPUKernel() override { ~DeconvolutionDepthwiseFp16CPUKernel() override;
delete sliding_;
free(packed_weight_);
if (need_align_) {
free(packed_input_);
free(packed_output_);
}
};
int Init() override; int Init() override;
int ReSize() override; int ReSize() override;
...@@ -52,7 +45,6 @@ class DeconvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseCPUKernel { ...@@ -52,7 +45,6 @@ class DeconvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseCPUKernel {
float16_t *packed_weight_; float16_t *packed_weight_;
float16_t *packed_input_; float16_t *packed_input_;
float16_t *packed_output_; float16_t *packed_output_;
bool need_align_ = false;
}; };
} // namespace mindspore::kernel } // namespace mindspore::kernel
......
...@@ -15,7 +15,6 @@ ...@@ -15,7 +15,6 @@
*/ */
#include "src/runtime/kernel/arm/fp32/batchnorm.h" #include "src/runtime/kernel/arm/fp32/batchnorm.h"
#include <cmath>
#include "schema/model_generated.h" #include "schema/model_generated.h"
#include "src/kernel_registry.h" #include "src/kernel_registry.h"
#include "include/errorcode.h" #include "include/errorcode.h"
...@@ -28,7 +27,42 @@ using mindspore::lite::RET_OK; ...@@ -28,7 +27,42 @@ using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_BatchNorm; using mindspore::schema::PrimitiveType_BatchNorm;
namespace mindspore::kernel { namespace mindspore::kernel {
// Destructor: gives back the kernel's own copies of the mean and variance
// data and clears the pointers afterwards.
BatchnormCPUKernel::~BatchnormCPUKernel() {
  // free() is a no-op for null pointers, so no explicit guard is needed.
  free(mean_addr_);
  mean_addr_ = nullptr;

  free(var_addr_);
  var_addr_ = nullptr;
}
int BatchnormCPUKernel::InitConstTensor() {
auto mean = inputs_[1];
mean_addr_ = reinterpret_cast<float *>(malloc(mean->ElementsNum() * sizeof(float)));
if (mean_addr_ == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed.";
return RET_ERROR;
}
memcpy(mean_addr_, mean->Data(), mean->ElementsNum() * sizeof(float));
auto variance = inputs_[2];
var_addr_ = reinterpret_cast<float *>(malloc(variance->ElementsNum() * sizeof(float)));
if (var_addr_ == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed.";
return RET_ERROR;
}
memcpy(var_addr_, variance->Data(), variance->ElementsNum() * sizeof(float));
return RET_OK;
}
int BatchnormCPUKernel::Init() { int BatchnormCPUKernel::Init() {
if (context_->infer_shape_interrupt_ && !context_->running_) {
SetNeedReInit();
return RET_OK;
}
auto input_shapes = inputs_[0]->shape(); auto input_shapes = inputs_[0]->shape();
auto n_dim = input_shapes.size(); auto n_dim = input_shapes.size();
batchnorm_param_->channel_ = input_shapes[n_dim - 1]; batchnorm_param_->channel_ = input_shapes[n_dim - 1];
...@@ -37,11 +71,24 @@ int BatchnormCPUKernel::Init() { ...@@ -37,11 +71,24 @@ int BatchnormCPUKernel::Init() {
batchnorm_param_->unit_ *= input_shapes[i]; batchnorm_param_->unit_ *= input_shapes[i];
} }
batchnorm_param_->op_parameter_.thread_num_ = batchnorm_param_->op_parameter_.thread_num_ =
MSMIN(batchnorm_param_->op_parameter_.thread_num_, batchnorm_param_->unit_); MSMIN(batchnorm_param_->op_parameter_.thread_num_, batchnorm_param_->channel_);
auto ret = InitConstTensor();
if (ret != 0) {
MS_LOG(ERROR) << "Batchnorm fp32 InitConstTensor failed.";
return RET_ERROR;
}
return RET_OK; return RET_OK;
} }
int BatchnormCPUKernel::ReSize() { return RET_OK; } int BatchnormCPUKernel::ReSize() {
auto input_shapes = inputs_[0]->shape();
batchnorm_param_->unit_ = 1;
for (int i = 0; i < input_shapes.size() - 1; i++) {
batchnorm_param_->unit_ *= input_shapes[i];
}
return RET_OK;
}
int BatchnormCPUKernel::DoExecute(int task_id) { int BatchnormCPUKernel::DoExecute(int task_id) {
BatchNorm(out_addr_, in_addr_, mean_addr_, var_addr_, task_id, batchnorm_param_); BatchNorm(out_addr_, in_addr_, mean_addr_, var_addr_, task_id, batchnorm_param_);
...@@ -61,12 +108,10 @@ int BatchNormRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) { ...@@ -61,12 +108,10 @@ int BatchNormRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) {
int BatchnormCPUKernel::Run() { int BatchnormCPUKernel::Run() {
auto prepare_ret = Prepare(); auto prepare_ret = Prepare();
if (prepare_ret != RET_OK) { if (prepare_ret != RET_OK) {
MS_LOG(ERROR) << "Prepare fail!ret: " << prepare_ret; MS_LOG(ERROR) << "Prepare fail! Ret error code: " << prepare_ret;
return prepare_ret; return prepare_ret;
} }
in_addr_ = reinterpret_cast<float *>(inputs_.at(0)->Data()); in_addr_ = reinterpret_cast<float *>(inputs_.at(0)->Data());
mean_addr_ = reinterpret_cast<float *>(inputs_.at(1)->Data());
var_addr_ = reinterpret_cast<float *>(inputs_.at(2)->Data());
out_addr_ = reinterpret_cast<float *>(outputs_.at(0)->Data()); out_addr_ = reinterpret_cast<float *>(outputs_.at(0)->Data());
int ret = LiteBackendParallelLaunch(BatchNormRun, this, batchnorm_param_->op_parameter_.thread_num_); int ret = LiteBackendParallelLaunch(BatchNormRun, this, batchnorm_param_->op_parameter_.thread_num_);
......
...@@ -31,14 +31,14 @@ class BatchnormCPUKernel : public LiteKernel { ...@@ -31,14 +31,14 @@ class BatchnormCPUKernel : public LiteKernel {
const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx, const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx,
const lite::Primitive *primitive) const lite::Primitive *primitive)
: LiteKernel(parameter, inputs, outputs, ctx, primitive) { : LiteKernel(parameter, inputs, outputs, ctx, primitive) {
opParameter->thread_num_ = ctx->thread_num_;
batchnorm_param_ = reinterpret_cast<BatchNormParameter *>(parameter); batchnorm_param_ = reinterpret_cast<BatchNormParameter *>(parameter);
} }
~BatchnormCPUKernel() override = default; ~BatchnormCPUKernel() override;
int Init() override; int Init() override;
int ReSize() override; int ReSize() override;
int Run() override; int Run() override;
int InitConstTensor();
int DoExecute(int tid); int DoExecute(int tid);
private: private:
......
...@@ -29,6 +29,24 @@ using mindspore::lite::RET_OK; ...@@ -29,6 +29,24 @@ using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_DepthwiseConv2D; using mindspore::schema::PrimitiveType_DepthwiseConv2D;
namespace mindspore::kernel { namespace mindspore::kernel {
// Destructor: releases the sliding-window descriptor, the packed weights and,
// when need_align_ is set, the packed input/output buffers.
ConvolutionDepthwiseCPUKernel::~ConvolutionDepthwiseCPUKernel() {
  delete sliding_;
  // NOTE(review): the packed buffers are assumed to be malloc()-allocated
  // (confirm against InitWeightBias/InitBuffer); such memory must be returned
  // with free(), since `delete` on malloc'd storage is undefined behaviour.
  if (packed_weight_ != nullptr) {
    free(packed_weight_);
    packed_weight_ = nullptr;
  }
  // The input/output staging buffers are only released when need_align_ is
  // set. NOTE(review): presumably they are not owned by the kernel otherwise
  // -- confirm.
  if (need_align_) {
    if (packed_input_ != nullptr) {
      free(packed_input_);
      packed_input_ = nullptr;
    }
    if (packed_output_ != nullptr) {
      free(packed_output_);
      packed_output_ = nullptr;
    }
  }
}
int ConvolutionDepthwiseCPUKernel::InitWeightBias() { int ConvolutionDepthwiseCPUKernel::InitWeightBias() {
// init weight: o, h, w, i; o == group, i == 1 // init weight: o, h, w, i; o == group, i == 1
auto weight_tensor = inputs_[kWeightIndex]; auto weight_tensor = inputs_[kWeightIndex];
...@@ -114,9 +132,16 @@ int ConvolutionDepthwiseCPUKernel::Init() { ...@@ -114,9 +132,16 @@ int ConvolutionDepthwiseCPUKernel::Init() {
int ConvolutionDepthwiseCPUKernel::ReSize() { int ConvolutionDepthwiseCPUKernel::ReSize() {
if (need_align_) { if (need_align_) {
free(packed_input_); if (packed_input_ != nullptr) {
free(packed_output_); delete packed_input_;
packed_input_ = nullptr;
}
if (packed_output_ != nullptr) {
delete packed_output_;
packed_output_ = nullptr;
}
} }
// conv base init // conv base init
ConvolutionBaseCPUKernel::Init(); ConvolutionBaseCPUKernel::Init();
...@@ -197,10 +222,11 @@ kernel::LiteKernel *CpuConvDwFp32KernelCreator(const std::vector<lite::tensor::T ...@@ -197,10 +222,11 @@ kernel::LiteKernel *CpuConvDwFp32KernelCreator(const std::vector<lite::tensor::T
kernel = new (std::nothrow) kernel::ConvolutionDepthwiseCPUKernel(opParameter, inputs, outputs, ctx, primitive); kernel = new (std::nothrow) kernel::ConvolutionDepthwiseCPUKernel(opParameter, inputs, outputs, ctx, primitive);
// auto param = reinterpret_cast<ConvParameter *>(opParameter); // auto param = reinterpret_cast<ConvParameter *>(opParameter);
// if (param->kernel_h_ == 3 && param->kernel_w_ == 3 && param->stride_h_ == 1 && param->stride_w_ == 1 && // if (param->kernel_h_ == 3 && param->kernel_w_ == 3 && param->stride_h_ == 1 && param->stride_w_ == 1 &&
// param->dilation_h_ == 1 && param->dilation_w_ == 1) { // param->dilation_h_ == 1 && param->dilation_w_ == 1) {
// kernel = new (std::nothrow) kernel::ConvolutionDepthwise3x3CPUKernel(opParameter, inputs, outputs, ctx); // kernel = new (std::nothrow) kernel::ConvolutionDepthwise3x3CPUKernel(opParameter, inputs, outputs, ctx,
// primitive);
// } else { // } else {
// kernel = new (std::nothrow) kernel::ConvolutionDepthwiseCPUKernel(opParameter, inputs, outputs, ctx); // kernel = new (std::nothrow) kernel::ConvolutionDepthwiseCPUKernel(opParameter, inputs, outputs, ctx, primitive);
// } // }
if (kernel == nullptr) { if (kernel == nullptr) {
......
...@@ -29,14 +29,7 @@ class ConvolutionDepthwiseCPUKernel : public ConvolutionBaseCPUKernel { ...@@ -29,14 +29,7 @@ class ConvolutionDepthwiseCPUKernel : public ConvolutionBaseCPUKernel {
const std::vector<lite::tensor::Tensor *> &outputs, const lite::Context *ctx, const std::vector<lite::tensor::Tensor *> &outputs, const lite::Context *ctx,
const lite::Primitive *primitive) const lite::Primitive *primitive)
: ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {} : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
~ConvolutionDepthwiseCPUKernel() override { ~ConvolutionDepthwiseCPUKernel() override;
delete sliding_;
free(packed_weight_);
if (need_align_) {
free(packed_input_);
free(packed_output_);
}
};
int Init() override; int Init() override;
int ReSize() override; int ReSize() override;
......
...@@ -27,6 +27,24 @@ using mindspore::lite::RET_OK; ...@@ -27,6 +27,24 @@ using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_DeDepthwiseConv2D; using mindspore::schema::PrimitiveType_DeDepthwiseConv2D;
namespace mindspore::kernel { namespace mindspore::kernel {
// Destructor: releases the sliding-window descriptor, the packed weights and,
// when need_align_ is set, the packed input/output buffers.
DeconvolutionDepthwiseCPUKernel::~DeconvolutionDepthwiseCPUKernel() {
  delete sliding_;
  // NOTE(review): the packed buffers are assumed to be malloc()-allocated
  // (confirm against InitWeightBias/InitBuffer); such memory must be returned
  // with free(), since `delete` on malloc'd storage is undefined behaviour.
  if (packed_weight_ != nullptr) {
    free(packed_weight_);
    packed_weight_ = nullptr;
  }
  // The input/output staging buffers are only released when need_align_ is
  // set. NOTE(review): presumably they are not owned by the kernel otherwise
  // -- confirm.
  if (need_align_) {
    if (packed_input_ != nullptr) {
      free(packed_input_);
      packed_input_ = nullptr;
    }
    if (packed_output_ != nullptr) {
      free(packed_output_);
      packed_output_ = nullptr;
    }
  }
}
int DeconvolutionDepthwiseCPUKernel::InitSlideParam() { int DeconvolutionDepthwiseCPUKernel::InitSlideParam() {
conv_param_->input_batch_ = outputs_.front()->shape().at(kNHWC_N); conv_param_->input_batch_ = outputs_.front()->shape().at(kNHWC_N);
conv_param_->input_h_ = outputs_.front()->shape().at(kNHWC_H); conv_param_->input_h_ = outputs_.front()->shape().at(kNHWC_H);
...@@ -126,8 +144,14 @@ int DeconvolutionDepthwiseCPUKernel::Init() { ...@@ -126,8 +144,14 @@ int DeconvolutionDepthwiseCPUKernel::Init() {
int DeconvolutionDepthwiseCPUKernel::ReSize() { int DeconvolutionDepthwiseCPUKernel::ReSize() {
if (need_align_) { if (need_align_) {
free(packed_input_); if (packed_input_ != nullptr) {
free(packed_output_); delete packed_input_;
packed_input_ = nullptr;
}
if (packed_output_ != nullptr) {
delete packed_output_;
packed_output_ = nullptr;
}
} }
InitSlideParam(); InitSlideParam();
......
...@@ -29,14 +29,7 @@ class DeconvolutionDepthwiseCPUKernel : public ConvolutionBaseCPUKernel { ...@@ -29,14 +29,7 @@ class DeconvolutionDepthwiseCPUKernel : public ConvolutionBaseCPUKernel {
const std::vector<lite::tensor::Tensor *> &outputs, const lite::Context *ctx, const std::vector<lite::tensor::Tensor *> &outputs, const lite::Context *ctx,
const lite::Primitive *primitive) const lite::Primitive *primitive)
: ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {} : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
~DeconvolutionDepthwiseCPUKernel() override { ~DeconvolutionDepthwiseCPUKernel() override;
delete sliding_;
free(packed_weight_);
if (need_align_) {
free(packed_input_);
free(packed_output_);
}
};
int Init() override; int Init() override;
int InitSlideParam(); int InitSlideParam();
......
...@@ -32,6 +32,12 @@ int FlattenCPUKernel::Init() { ...@@ -32,6 +32,12 @@ int FlattenCPUKernel::Init() {
SetNeedReInit(); SetNeedReInit();
return RET_OK; return RET_OK;
} }
ReSize();
return RET_OK;
}
int FlattenCPUKernel::ReSize() {
auto output_shape = outputs_[0]->shape(); auto output_shape = outputs_[0]->shape();
flatten_param_->size = sizeof(float); flatten_param_->size = sizeof(float);
for (int i = 0; i < output_shape.size(); i++) { for (int i = 0; i < output_shape.size(); i++) {
...@@ -40,8 +46,6 @@ int FlattenCPUKernel::Init() { ...@@ -40,8 +46,6 @@ int FlattenCPUKernel::Init() {
return RET_OK; return RET_OK;
} }
int FlattenCPUKernel::ReSize() { return RET_OK; }
int FlattenCPUKernel::Run() { int FlattenCPUKernel::Run() {
auto prepare_ret = Prepare(); auto prepare_ret = Prepare();
if (prepare_ret != RET_OK) { if (prepare_ret != RET_OK) {
......
...@@ -15,10 +15,10 @@ ...@@ -15,10 +15,10 @@
*/ */
#include "src/runtime/kernel/arm/fp32/fused_batchnorm.h" #include "src/runtime/kernel/arm/fp32/fused_batchnorm.h"
#include <cmath>
#include "schema/model_generated.h" #include "schema/model_generated.h"
#include "src/kernel_registry.h" #include "src/kernel_registry.h"
#include "include/errorcode.h" #include "include/errorcode.h"
#include "src/runtime/runtime_api.h"
using mindspore::kernel::KERNEL_ARCH::kCPU; using mindspore::kernel::KERNEL_ARCH::kCPU;
using mindspore::lite::KernelRegistrar; using mindspore::lite::KernelRegistrar;
...@@ -27,33 +27,121 @@ using mindspore::lite::RET_OK; ...@@ -27,33 +27,121 @@ using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_FusedBatchNorm; using mindspore::schema::PrimitiveType_FusedBatchNorm;
namespace mindspore::kernel { namespace mindspore::kernel {
// Destructor: gives back the kernel's own copies of the scale, offset, mean
// and variance data, clearing each pointer once its buffer is released.
FusedBatchnormCPUKernel::~FusedBatchnormCPUKernel() {
  // free() is a no-op for null pointers, so no explicit guards are needed.
  free(scale_addr_);
  scale_addr_ = nullptr;

  free(offset_addr_);
  offset_addr_ = nullptr;

  free(mean_addr_);
  mean_addr_ = nullptr;

  free(var_addr_);
  var_addr_ = nullptr;
}
int FusedBatchnormCPUKernel::InitConstTensor() {
auto scale = inputs_[1];
scale_addr_ = reinterpret_cast<float *>(malloc(scale->ElementsNum() * sizeof(float)));
if (scale_addr_ == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed.";
return RET_ERROR;
}
memcpy(scale_addr_, scale->Data(), scale->ElementsNum() * sizeof(float));
auto offset = inputs_[2];
offset_addr_ = reinterpret_cast<float *>(malloc(offset->ElementsNum() * sizeof(float)));
if (offset_addr_ == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed.";
return RET_ERROR;
}
memcpy(offset_addr_, offset->Data(), offset->ElementsNum() * sizeof(float));
auto mean = inputs_[3];
mean_addr_ = reinterpret_cast<float *>(malloc(mean->ElementsNum() * sizeof(float)));
if (mean_addr_ == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed.";
return RET_ERROR;
}
memcpy(mean_addr_, mean->Data(), mean->ElementsNum() * sizeof(float));
auto variance = inputs_[4];
var_addr_ = reinterpret_cast<float *>(malloc(variance->ElementsNum() * sizeof(float)));
if (var_addr_ == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed.";
return RET_ERROR;
}
memcpy(var_addr_, variance->Data(), variance->ElementsNum() * sizeof(float));
return RET_OK;
}
int FusedBatchnormCPUKernel::Init() { int FusedBatchnormCPUKernel::Init() {
if (context_->infer_shape_interrupt_ && !context_->running_) { if (context_->infer_shape_interrupt_ && !context_->running_) {
SetNeedReInit(); SetNeedReInit();
return RET_OK; return RET_OK;
} }
input_shape_ = reinterpret_cast<int *>(malloc(sizeof(int) * inputs_[0]->shape().size())); auto input_shapes = inputs_[0]->shape();
memcpy(input_shape_, inputs_[0]->shape().data(), inputs_[0]->shape().size() * sizeof(int)); auto n_dim = input_shapes.size();
batchnorm_param_->channel_ = input_shapes[n_dim - 1];
batchnorm_param_->unit_ = 1;
for (int i = 0; i < n_dim - 1; i++) {
batchnorm_param_->unit_ *= input_shapes[i];
}
batchnorm_param_->op_parameter_.thread_num_ =
MSMIN(batchnorm_param_->op_parameter_.thread_num_, batchnorm_param_->channel_);
auto ret = InitConstTensor();
if (ret != 0) {
MS_LOG(ERROR) << "FusedBatchnorm fp32 InitConstTensor failed.";
return RET_ERROR;
}
return RET_OK;
}
// Recomputes batchnorm_param_->unit_ as the product of every input dimension
// except the last one. Always returns RET_OK.
int FusedBatchnormCPUKernel::ReSize() {
  const auto input_shapes = inputs_[0]->shape();
  batchnorm_param_->unit_ = 1;
  // `i + 1 < size()` instead of `i < size() - 1`: the latter wraps around to a
  // huge unsigned value for an empty shape and would read out of bounds.
  for (size_t i = 0; i + 1 < input_shapes.size(); ++i) {
    batchnorm_param_->unit_ *= input_shapes[i];
  }
  return RET_OK;
}
int FusedBatchnormCPUKernel::Execute(int task_id) {
FusedBatchNorm(out_addr_, in_addr_, scale_addr_, offset_addr_, mean_addr_, var_addr_, task_id, batchnorm_param_);
return RET_OK; return RET_OK;
} }
int FusedBatchnormCPUKernel::ReSize() { return RET_OK; } int FusedBatchNormRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) {
auto g_kernel = reinterpret_cast<FusedBatchnormCPUKernel *>(cdata);
auto ret = g_kernel->Execute(task_id);
if (ret != RET_OK) {
MS_LOG(ERROR) << "FusedBatchnormRun error task_id[" << task_id << "] error_code[" << ret << "]";
return ret;
}
return RET_OK;
}
int FusedBatchnormCPUKernel::Run() { int FusedBatchnormCPUKernel::Run() {
auto prepare_ret = Prepare(); auto prepare_ret = Prepare();
if (prepare_ret != RET_OK) { if (prepare_ret != RET_OK) {
MS_LOG(ERROR) << "Prepare fail!ret: " << prepare_ret; MS_LOG(ERROR) << "Prepare fail! Ret error code: " << prepare_ret;
return prepare_ret; return prepare_ret;
} }
auto input_addr = reinterpret_cast<float *>(inputs_.at(0)->Data()); in_addr_ = reinterpret_cast<float *>(inputs_.at(0)->Data());
auto scale_addr = reinterpret_cast<float *>(inputs_.at(1)->Data()); out_addr_ = reinterpret_cast<float *>(outputs_.at(0)->Data());
auto offest_addr = reinterpret_cast<float *>(inputs_.at(2)->Data());
auto mean_addr = reinterpret_cast<float *>(inputs_.at(3)->Data());
auto variance_addr = reinterpret_cast<float *>(inputs_.at(4)->Data());
auto output_addr = reinterpret_cast<float *>(outputs_.at(0)->Data());
FusedBatchNorm(input_addr, scale_addr, offest_addr, mean_addr, variance_addr, input_shape_, int ret = LiteBackendParallelLaunch(FusedBatchNormRun, this, batchnorm_param_->op_parameter_.thread_num_);
fused_batchnorm_param_->epsilon_, output_addr); if (ret != RET_OK) {
MS_LOG(ERROR) << "FusedBatchnormRun error error_code[" << ret << "]";
return ret;
}
return RET_OK; return RET_OK;
} }
...@@ -63,8 +151,8 @@ kernel::LiteKernel *CpuFusedBatchnormKernelCreator(const std::vector<lite::tenso ...@@ -63,8 +151,8 @@ kernel::LiteKernel *CpuFusedBatchnormKernelCreator(const std::vector<lite::tenso
const kernel::KernelKey &desc, const lite::Primitive *primitive) { const kernel::KernelKey &desc, const lite::Primitive *primitive) {
MS_ASSERT(opParameter != nullptr); MS_ASSERT(opParameter != nullptr);
MS_ASSERT(desc.type == schema::PrimitiveType_FusedBatchNorm); MS_ASSERT(desc.type == schema::PrimitiveType_FusedBatchNorm);
FusedBatchnormCPUKernel *kernel = new (std::nothrow) FusedBatchnormCPUKernel(opParameter, inputs, outputs, ctx, FusedBatchnormCPUKernel *kernel =
primitive); new (std::nothrow) FusedBatchnormCPUKernel(opParameter, inputs, outputs, ctx, primitive);
if (kernel == nullptr) { if (kernel == nullptr) {
MS_LOG(ERROR) << "new FusedBatchnormCPUKernel fail!"; MS_LOG(ERROR) << "new FusedBatchnormCPUKernel fail!";
return nullptr; return nullptr;
......
...@@ -19,7 +19,7 @@ ...@@ -19,7 +19,7 @@
#include <vector> #include <vector>
#include "src/lite_kernel.h" #include "src/lite_kernel.h"
#include "src/runtime/kernel/arm/nnacl/fused_batchnorm.h" #include "src/runtime/kernel/arm/nnacl/fp32/batchnorm.h"
namespace mindspore::kernel { namespace mindspore::kernel {
class FusedBatchnormCPUKernel : public LiteKernel { class FusedBatchnormCPUKernel : public LiteKernel {
...@@ -28,17 +28,26 @@ class FusedBatchnormCPUKernel : public LiteKernel { ...@@ -28,17 +28,26 @@ class FusedBatchnormCPUKernel : public LiteKernel {
const std::vector<lite::tensor::Tensor *> &outputs, const lite::Context *ctx, const std::vector<lite::tensor::Tensor *> &outputs, const lite::Context *ctx,
const lite::Primitive *primitive) const lite::Primitive *primitive)
: LiteKernel(parameter, inputs, outputs, ctx, primitive) { : LiteKernel(parameter, inputs, outputs, ctx, primitive) {
fused_batchnorm_param_ = reinterpret_cast<FusedBatchNormParameter *>(parameter); batchnorm_param_ = reinterpret_cast<BatchNormParameter *>(parameter);
} }
~FusedBatchnormCPUKernel() override { delete fused_batchnorm_param_; } ~FusedBatchnormCPUKernel() override;
int Init() override; int Init() override;
int ReSize() override; int ReSize() override;
int Run() override; int Run() override;
int InitConstTensor();
int Execute(int task_id);
private: private:
int *input_shape_{}; float *in_addr_;
FusedBatchNormParameter *fused_batchnorm_param_; float *mean_addr_;
float *var_addr_;
float *scale_addr_;
float *offset_addr_;
float *out_addr_;
BatchNormParameter *batchnorm_param_;
}; };
} // namespace mindspore::kernel } // namespace mindspore::kernel
......
...@@ -17,7 +17,6 @@ ...@@ -17,7 +17,6 @@
#include "src/runtime/kernel/arm/fp32/scale.h" #include "src/runtime/kernel/arm/fp32/scale.h"
#include <string.h> #include <string.h>
#include <vector> #include <vector>
#include "src/runtime/kernel/arm/nnacl/scale.h"
#include "schema/model_generated.h" #include "schema/model_generated.h"
#include "src/kernel_registry.h" #include "src/kernel_registry.h"
#include "include/errorcode.h" #include "include/errorcode.h"
...@@ -29,23 +28,29 @@ using mindspore::lite::RET_OK; ...@@ -29,23 +28,29 @@ using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_Scale; using mindspore::schema::PrimitiveType_Scale;
namespace mindspore::kernel { namespace mindspore::kernel {
// Destructor: delegates all cleanup to FreeTmpBuffer().
ScaleCPUKernel::~ScaleCPUKernel() {
  FreeTmpBuffer();
}
void ScaleCPUKernel::FreeTmpBuffer() { void ScaleCPUKernel::FreeTmpBuffer() {
if (scale_ != nullptr) { if (scale_param_->const_scale_) {
free(scale_); if (scale_ != nullptr) {
scale_ = nullptr; free(scale_);
scale_ = nullptr;
}
} }
if (offset_ != nullptr) { if (scale_param_->has_offset_) {
free(offset_); if (offset_ != nullptr) {
offset_ = nullptr; free(offset_);
offset_ = nullptr;
}
} }
} }
int ScaleCPUKernel::InitScaleOffset() { int ScaleCPUKernel::InitScaleOffset() {
FreeTmpBuffer(); FreeTmpBuffer();
auto param = reinterpret_cast<ScaleParameter *>(opParameter);
auto scale_tensor = inputs_.at(1); auto scale_tensor = inputs_.at(1);
float *scale_ptr = reinterpret_cast<float *>(inputs_.at(1)->Data()); float *scale_ptr = reinterpret_cast<float *>(inputs_.at(1)->Data());
if (scale_ptr != nullptr) { if (scale_ptr != nullptr) {
scale_param_->const_scale_ = true;
scale_ = reinterpret_cast<float *>(malloc(scale_tensor->ElementsNum() * sizeof(float))); scale_ = reinterpret_cast<float *>(malloc(scale_tensor->ElementsNum() * sizeof(float)));
if (scale_ == nullptr) { if (scale_ == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed."; MS_LOG(ERROR) << "Malloc buffer failed.";
...@@ -53,6 +58,7 @@ int ScaleCPUKernel::InitScaleOffset() { ...@@ -53,6 +58,7 @@ int ScaleCPUKernel::InitScaleOffset() {
} }
memcpy(scale_, scale_ptr, scale_tensor->ElementsNum() * sizeof(float)); memcpy(scale_, scale_ptr, scale_tensor->ElementsNum() * sizeof(float));
} else { } else {
scale_param_->const_scale_ = false;
scale_ = nullptr; scale_ = nullptr;
} }
...@@ -64,40 +70,39 @@ int ScaleCPUKernel::InitScaleOffset() { ...@@ -64,40 +70,39 @@ int ScaleCPUKernel::InitScaleOffset() {
return RET_ERROR; return RET_ERROR;
} }
memcpy(offset_, offset_tensor->Data(), offset_tensor->ElementsNum() * sizeof(float)); memcpy(offset_, offset_tensor->Data(), offset_tensor->ElementsNum() * sizeof(float));
param->has_offset_ = true; scale_param_->has_offset_ = true;
} else { } else {
offset_ = nullptr; offset_ = nullptr;
param->has_offset_ = false; scale_param_->has_offset_ = false;
} }
return RET_OK; return RET_OK;
} }
int ScaleCPUKernel::InitParameter() { int ScaleCPUKernel::InitParameter() {
auto param = reinterpret_cast<ScaleParameter *>(opParameter);
auto in_tensor = inputs_.at(0); auto in_tensor = inputs_.at(0);
auto in_shape = in_tensor->shape(); auto in_shape = in_tensor->shape();
auto scale_tensor = inputs_.at(1); auto scale_tensor = inputs_.at(1);
auto scale_shape = scale_tensor->shape(); auto scale_shape = scale_tensor->shape();
if (scale_shape.size() + param->axis_ > in_shape.size()) { if (scale_shape.size() + scale_param_->axis_ > in_shape.size()) {
MS_LOG(ERROR) << "Scale tensor shape is incorrect."; MS_LOG(ERROR) << "Scale tensor shape is incorrect.";
return RET_ERROR; return RET_ERROR;
} }
param->outer_size_ = 1; scale_param_->outer_size_ = 1;
param->axis_size_ = 1; scale_param_->axis_size_ = 1;
param->inner_size_ = 1; scale_param_->inner_size_ = 1;
for (int i = 0; i < param->axis_; i++) { for (int i = 0; i < scale_param_->axis_; i++) {
param->outer_size_ *= in_shape[i]; scale_param_->outer_size_ *= in_shape[i];
} }
for (int i = 0; i < scale_shape.size(); i++) { for (int i = 0; i < scale_shape.size(); i++) {
if (in_shape[i + param->axis_] != scale_shape[i]) { if (in_shape[i + scale_param_->axis_] != scale_shape[i]) {
MS_LOG(ERROR) << "Scale tensor shape is incorrect."; MS_LOG(ERROR) << "Scale tensor shape is incorrect.";
return RET_ERROR; return RET_ERROR;
} }
param->axis_size_ *= in_shape[i + param->axis_]; scale_param_->axis_size_ *= in_shape[i + scale_param_->axis_];
} }
for (int i = param->axis_ + scale_shape.size(); i < in_shape.size(); i++) { for (int i = scale_param_->axis_ + scale_shape.size(); i < in_shape.size(); i++) {
param->inner_size_ *= in_shape[i]; scale_param_->inner_size_ *= in_shape[i];
} }
return RET_OK; return RET_OK;
} }
...@@ -130,9 +135,7 @@ int ScaleCPUKernel::ReSize() { ...@@ -130,9 +135,7 @@ int ScaleCPUKernel::ReSize() {
} }
int ScaleCPUKernel::Scale(int task_id) { int ScaleCPUKernel::Scale(int task_id) {
auto ret = auto ret = DoScale(input_ptr_, output_ptr_, scale_, offset_, task_id, scale_param_);
DoScale(input_ptr_, output_ptr_, scale_, offset_, task_id, reinterpret_cast<ScaleParameter *>(opParameter));
if (ret != RET_OK) { if (ret != RET_OK) {
MS_LOG(ERROR) << "Scale error task_id[" << task_id << "] error_code[" << ret << "]"; MS_LOG(ERROR) << "Scale error task_id[" << task_id << "] error_code[" << ret << "]";
return RET_ERROR; return RET_ERROR;
......
...@@ -19,6 +19,7 @@ ...@@ -19,6 +19,7 @@
#include <vector> #include <vector>
#include "src/lite_kernel.h" #include "src/lite_kernel.h"
#include "src/runtime/kernel/arm/nnacl/scale.h"
namespace mindspore::kernel { namespace mindspore::kernel {
...@@ -27,10 +28,10 @@ class ScaleCPUKernel : public LiteKernel { ...@@ -27,10 +28,10 @@ class ScaleCPUKernel : public LiteKernel {
ScaleCPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs, ScaleCPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
const std::vector<lite::tensor::Tensor *> &outputs, const lite::Context *ctx, const std::vector<lite::tensor::Tensor *> &outputs, const lite::Context *ctx,
const lite::Primitive *primitive) const lite::Primitive *primitive)
: LiteKernel(parameter, inputs, outputs, ctx, primitive) {} : LiteKernel(parameter, inputs, outputs, ctx, primitive) {
~ScaleCPUKernel() { scale_param_ = reinterpret_cast<ScaleParameter *>(opParameter);
FreeTmpBuffer();
} }
~ScaleCPUKernel() override;
int Init() override; int Init() override;
int ReSize() override; int ReSize() override;
...@@ -45,6 +46,7 @@ class ScaleCPUKernel : public LiteKernel { ...@@ -45,6 +46,7 @@ class ScaleCPUKernel : public LiteKernel {
float *scale_; float *scale_;
float *offset_; float *offset_;
float *output_ptr_; float *output_ptr_;
ScaleParameter *scale_param_;
}; };
} // namespace mindspore::kernel } // namespace mindspore::kernel
......
...@@ -28,6 +28,24 @@ using mindspore::lite::RET_OK; ...@@ -28,6 +28,24 @@ using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_DepthwiseConv2D; using mindspore::schema::PrimitiveType_DepthwiseConv2D;
namespace mindspore::kernel { namespace mindspore::kernel {
// Destructor: releases the sliding-window descriptor and the packing buffers held by this kernel.
//
// The packing buffers are released with free(), not `delete`: releasing malloc()-allocated memory
// with `delete` is undefined behaviour, and free() is the pairing this kernel used before.
// NOTE(review): the allocation sites of packed_weight_/packed_input_/packed_output_ are outside
// this block; they are assumed to be malloc() calls -- confirm.
ConvolutionDepthwiseInt8CPUKernel::~ConvolutionDepthwiseInt8CPUKernel() {
  delete sliding;

  // free(nullptr) is a no-op, so no explicit null checks are required.
  free(packed_weight_);
  packed_weight_ = nullptr;

  free(packed_input_);
  packed_input_ = nullptr;

  // packed_output_ is only released when need_align_ is set.
  // NOTE(review): presumably it is a borrowed pointer otherwise -- confirm.
  if (need_align_) {
    free(packed_output_);
    packed_output_ = nullptr;
  }
}
int ConvolutionDepthwiseInt8CPUKernel::InitWeightBias() { int ConvolutionDepthwiseInt8CPUKernel::InitWeightBias() {
// init weight, int8 -> int16 // init weight, int8 -> int16
// o, h, w, i -> o/8, h, w, i, 8; o == group, i == 1 // o, h, w, i -> o/8, h, w, i, 8; o == group, i == 1
...@@ -111,10 +129,17 @@ int ConvolutionDepthwiseInt8CPUKernel::Init() { ...@@ -111,10 +129,17 @@ int ConvolutionDepthwiseInt8CPUKernel::Init() {
} }
int ConvolutionDepthwiseInt8CPUKernel::ReSize() { int ConvolutionDepthwiseInt8CPUKernel::ReSize() {
free(packed_input_); if (packed_input_ != nullptr) {
delete packed_input_;
packed_input_ = nullptr;
}
if (need_align_) { if (need_align_) {
free(packed_output_); if (packed_output_ != nullptr) {
delete packed_output_;
packed_output_ = nullptr;
}
} }
// conv base init // conv base init
ConvolutionBaseCPUKernel::Init(); ConvolutionBaseCPUKernel::Init();
......
...@@ -29,14 +29,7 @@ class ConvolutionDepthwiseInt8CPUKernel : public ConvolutionBaseCPUKernel { ...@@ -29,14 +29,7 @@ class ConvolutionDepthwiseInt8CPUKernel : public ConvolutionBaseCPUKernel {
const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx, const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx,
const lite::Primitive *primitive) const lite::Primitive *primitive)
: ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {} : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
~ConvolutionDepthwiseInt8CPUKernel() override { ~ConvolutionDepthwiseInt8CPUKernel() override;
delete sliding;
free(packed_weight_);
free(packed_input_);
if (need_align_) {
free(packed_output_);
}
};
int Init() override; int Init() override;
int ReSize() override; int ReSize() override;
......
...@@ -28,6 +28,28 @@ using mindspore::lite::RET_OK; ...@@ -28,6 +28,28 @@ using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_DeDepthwiseConv2D; using mindspore::schema::PrimitiveType_DeDepthwiseConv2D;
namespace mindspore::kernel { namespace mindspore::kernel {
// Destructor: releases the sliding-window descriptor, the packing buffers and the temporary
// int32 output buffer held by this kernel.
//
// The buffers are released with free(), not `delete`: output_buffer_ is obtained from malloc()
// in InitBuffer(), and releasing malloc()-allocated memory with `delete` is undefined behaviour.
// NOTE(review): the allocation sites of packed_weight_/packed_input_/packed_output_ are outside
// this block; they are assumed to be malloc() calls as well -- confirm.
DeconvolutionDepthwiseInt8CPUKernel::~DeconvolutionDepthwiseInt8CPUKernel() {
  delete sliding;

  // free(nullptr) is a no-op, so no explicit null checks are required.
  free(packed_weight_);
  packed_weight_ = nullptr;

  free(packed_input_);
  packed_input_ = nullptr;

  // packed_output_ is only released when need_align_ is set.
  // NOTE(review): presumably it is a borrowed pointer otherwise -- confirm.
  if (need_align_) {
    free(packed_output_);
    packed_output_ = nullptr;
  }

  free(output_buffer_);
  output_buffer_ = nullptr;
}
int DeconvolutionDepthwiseInt8CPUKernel::InitWeightBias() { int DeconvolutionDepthwiseInt8CPUKernel::InitWeightBias() {
// init weight: int8 -> int16 // init weight: int8 -> int16
// o, h, w, i -> o/8, h, w, i, 8; o == group, i == 1 // o, h, w, i -> o/8, h, w, i, 8; o == group, i == 1
...@@ -101,9 +123,9 @@ int DeconvolutionDepthwiseInt8CPUKernel::InitBuffer() { ...@@ -101,9 +123,9 @@ int DeconvolutionDepthwiseInt8CPUKernel::InitBuffer() {
} }
// malloc tmp buffer for int32 output // malloc tmp buffer for int32 output
output_buffer = output_buffer_ =
reinterpret_cast<int32_t *>(malloc(conv_param_->output_h_ * conv_param_->output_w_ * C4NUM * sizeof(int32_t))); reinterpret_cast<int32_t *>(malloc(conv_param_->output_h_ * conv_param_->output_w_ * C4NUM * sizeof(int32_t)));
if (output_buffer == nullptr) { if (output_buffer_ == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed."; MS_LOG(ERROR) << "Malloc buffer failed.";
return RET_ERROR; return RET_ERROR;
} }
...@@ -144,10 +166,21 @@ int DeconvolutionDepthwiseInt8CPUKernel::Init() { ...@@ -144,10 +166,21 @@ int DeconvolutionDepthwiseInt8CPUKernel::Init() {
} }
int DeconvolutionDepthwiseInt8CPUKernel::ReSize() { int DeconvolutionDepthwiseInt8CPUKernel::ReSize() {
free(packed_input_); if (packed_input_ != nullptr) {
delete packed_input_;
packed_input_ = nullptr;
}
if (need_align_) { if (need_align_) {
free(packed_output_); if (packed_output_ != nullptr) {
delete packed_output_;
packed_output_ = nullptr;
}
}
if (output_buffer_ != nullptr) {
delete output_buffer_;
output_buffer_ = nullptr;
} }
InitSlideParam(); InitSlideParam();
// conv base init // conv base init
...@@ -162,7 +195,7 @@ int DeconvolutionDepthwiseInt8CPUKernel::ReSize() { ...@@ -162,7 +195,7 @@ int DeconvolutionDepthwiseInt8CPUKernel::ReSize() {
} }
int DeconvolutionDepthwiseInt8CPUKernel::Execute(int task_id) { int DeconvolutionDepthwiseInt8CPUKernel::Execute(int task_id) {
DeconvDwInt8(packed_output_, output_buffer, packed_input_, packed_weight_, reinterpret_cast<int32_t *>(bias_data_), DeconvDwInt8(packed_output_, output_buffer_, packed_input_, packed_weight_, reinterpret_cast<int32_t *>(bias_data_),
conv_param_, sliding, task_id); conv_param_, sliding, task_id);
return RET_OK; return RET_OK;
} }
......
...@@ -29,14 +29,7 @@ class DeconvolutionDepthwiseInt8CPUKernel : public ConvolutionBaseCPUKernel { ...@@ -29,14 +29,7 @@ class DeconvolutionDepthwiseInt8CPUKernel : public ConvolutionBaseCPUKernel {
const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx, const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx,
const lite::Primitive *primitive) const lite::Primitive *primitive)
: ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {} : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
~DeconvolutionDepthwiseInt8CPUKernel() override { ~DeconvolutionDepthwiseInt8CPUKernel() override;
delete sliding;
free(packed_weight_);
free(packed_input_);
if (need_align_) {
free(packed_output_);
}
};
int Init() override; int Init() override;
int ReSize() override; int ReSize() override;
...@@ -52,7 +45,7 @@ class DeconvolutionDepthwiseInt8CPUKernel : public ConvolutionBaseCPUKernel { ...@@ -52,7 +45,7 @@ class DeconvolutionDepthwiseInt8CPUKernel : public ConvolutionBaseCPUKernel {
int16_t *packed_weight_; int16_t *packed_weight_;
int16_t *packed_input_; int16_t *packed_input_;
int8_t *packed_output_; int8_t *packed_output_;
int32_t *output_buffer; int32_t *output_buffer_;
bool need_align_ = false; bool need_align_ = false;
}; };
} // namespace mindspore::kernel } // namespace mindspore::kernel
......
...@@ -24,4 +24,3 @@ typedef struct FlattenParameter { ...@@ -24,4 +24,3 @@ typedef struct FlattenParameter {
void Flatten(const void *input, void *output, FlattenParameter *flatten_param); void Flatten(const void *input, void *output, FlattenParameter *flatten_param);
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_FLATTEN_H_ #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_FLATTEN_H_
...@@ -19,10 +19,21 @@ ...@@ -19,10 +19,21 @@
void BatchNorm(float *output_ptr, const float *input_ptr, const float *mean_ptr, const float *variance_ptr, int task_id, void BatchNorm(float *output_ptr, const float *input_ptr, const float *mean_ptr, const float *variance_ptr, int task_id,
BatchNormParameter *param) { BatchNormParameter *param) {
for (int u = task_id; u < param->unit_; u += param->op_parameter_.thread_num_) { for (int c = task_id; c < param->channel_; c += param->op_parameter_.thread_num_) {
for (int c = 0; c < param->channel_; c++) { auto variance_sqrt = sqrt(variance_ptr[c] + param->epsilon_);
auto variance_sqrt = sqrt(variance_ptr[c] + param->epsilon_); for (int u = 0; u < param->unit_; u++) {
output_ptr[u * param->channel_ + c] = (input_ptr[u * param->channel_ + c] - mean_ptr[c]) / variance_sqrt; output_ptr[u * param->channel_ + c] = (input_ptr[u * param->channel_ + c] - mean_ptr[c]) / variance_sqrt;
} }
} }
} }
// Fused batch normalization over a buffer indexed as [unit][channel].
//
// For every channel c handled by this task:
//   out[u, c] = (in[u, c] - mean[c]) / sqrt(variance[c] + epsilon_) * scale[c] + offset[c]
//
// Work is split across tasks by channel: task `task_id` processes channels
// task_id, task_id + thread_num_, task_id + 2 * thread_num_, ...
void FusedBatchNorm(float *output_ptr, const float *input_ptr, const float *scale_ptr, const float *offest_ptr,
                    const float *mean_ptr, const float *variance_ptr, int task_id, BatchNormParameter *param) {
  for (int channel = task_id; channel < param->channel_; channel += param->op_parameter_.thread_num_) {
    // The denominator depends only on the channel, so it is computed once per channel.
    auto std_dev = sqrt(variance_ptr[channel] + param->epsilon_);
    int unit = 0;
    while (unit < param->unit_) {
      const auto pos = unit * param->channel_ + channel;
      output_ptr[pos] = (input_ptr[pos] - mean_ptr[channel]) / std_dev * scale_ptr[channel] + offest_ptr[channel];
      ++unit;
    }
  }
}
...@@ -29,4 +29,7 @@ typedef struct BatchNormParameter { ...@@ -29,4 +29,7 @@ typedef struct BatchNormParameter {
void BatchNorm(float *output_ptr, const float *input_ptr, const float *mean_ptr, const float *variance_ptr, int task_id, void BatchNorm(float *output_ptr, const float *input_ptr, const float *mean_ptr, const float *variance_ptr, int task_id,
BatchNormParameter *param); BatchNormParameter *param);
void FusedBatchNorm(float *output_ptr, const float *input_ptr, const float *scale_ptr, const float *offest_ptr,
const float *mean_ptr, const float *variance_ptr, int task_id, BatchNormParameter *param);
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_FUSED_BATCHNORM_H_ #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_FUSED_BATCHNORM_H_
...@@ -486,6 +486,21 @@ void ConvDw3x3Fp32OutputUnit(float *src_buf, float *dst_output, const float *bia ...@@ -486,6 +486,21 @@ void ConvDw3x3Fp32OutputUnit(float *src_buf, float *dst_output, const float *bia
float32x4_t d10 = vaddq_f32(vaddq_f32(vaddq_f32(t10, t11), t12), bias_ptr); float32x4_t d10 = vaddq_f32(vaddq_f32(vaddq_f32(t10, t11), t12), bias_ptr);
float32x4_t d11 = vaddq_f32(vsubq_f32(vsubq_f32(t11, t12), t13), bias_ptr); float32x4_t d11 = vaddq_f32(vsubq_f32(vsubq_f32(t11, t12), t13), bias_ptr);
float32x4_t zeros = {0, 0, 0, 0};
float32x4_t bounds = {6, 6, 6, 6};
if (is_relu) {
d00 = vmaxq_f32(d00, zeros);
d01 = vmaxq_f32(d01, zeros);
d10 = vmaxq_f32(d10, zeros);
d11 = vmaxq_f32(d11, zeros);
}
if (is_relu6) {
d00 = vminq_f32(vmaxq_f32(d00, zeros), bounds);
d01 = vminq_f32(vmaxq_f32(d01, zeros), bounds);
d10 = vminq_f32(vmaxq_f32(d10, zeros), bounds);
d11 = vminq_f32(vmaxq_f32(d11, zeros), bounds);
}
vst1q_f32(dst_output, d00); vst1q_f32(dst_output, d00);
if (w_in_range) { if (w_in_range) {
vst1q_f32(dst_output + channel, d01); vst1q_f32(dst_output + channel, d01);
...@@ -536,6 +551,19 @@ void ConvDw3x3Fp32OutputUnit(float *src_buf, float *dst_output, const float *bia ...@@ -536,6 +551,19 @@ void ConvDw3x3Fp32OutputUnit(float *src_buf, float *dst_output, const float *bia
float d10 = t10 + t11 + t12 + bias_ptr[0]; float d10 = t10 + t11 + t12 + bias_ptr[0];
float d11 = t11 - t12 - t13 + bias_ptr[0]; float d11 = t11 - t12 - t13 + bias_ptr[0];
if (is_relu) {
d00 = MSMAX(d00, 0);
d01 = MSMAX(d01, 0);
d10 = MSMAX(d10, 0);
d11 = MSMAX(d11, 0);
}
if (is_relu6) {
d00 = MSMIN(MSMAX(d00, 0), 6);
d01 = MSMIN(MSMAX(d01, 0), 6);
d10 = MSMIN(MSMAX(d10, 0), 6);
d11 = MSMIN(MSMAX(d11, 0), 6);
}
(dst_output + i)[0] = d00; (dst_output + i)[0] = d00;
if (w_in_range) { if (w_in_range) {
(dst_output + i + channel)[0] = d01; (dst_output + i + channel)[0] = d01;
......
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "nnacl/fused_batchnorm.h"
#include <math.h>
// Fused batch normalization over a 4-D buffer whose last dimension is the channel.
//
// input_shapes[0..2] are multiplied together to give the number of "units"; input_shapes[3]
// is the channel count. Element (u, c) lives at index u * channel + c. For every element:
//   out[u, c] = (in[u, c] - mean[c]) / sqrt(variance[c] + epsilon) * scale[c] + offset[c]
void FusedBatchNorm(const float *input_ptr, const float *scale_ptr, const float *offest_ptr, const float *mean_ptr,
                    const float *variance_ptr, int *input_shapes, float epsilon, float *output_ptr) {
  const int channel_count = input_shapes[3];

  // Collapse the three leading dimensions into a single unit count.
  int unit_count = 1;
  int dim = 0;
  while (dim < 3) {
    unit_count *= input_shapes[dim];
    ++dim;
  }

  for (int ch = 0; ch < input_shapes[3]; ++ch) {
    // The denominator depends only on the channel, so it is computed once per channel.
    auto std_dev = sqrt(variance_ptr[ch] + epsilon);
    for (int unit = 0; unit < unit_count; ++unit) {
      const int pos = unit * channel_count + ch;
      output_ptr[pos] = (input_ptr[pos] - mean_ptr[ch]) / std_dev * scale_ptr[ch] + offest_ptr[ch];
    }
  }
}
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_FUSED_BATCHNORM_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_FUSED_BATCHNORM_H_
#include "nnacl/op_base.h"
// Parameter block for the fused batch-norm operator.
typedef struct FusedBatchNormParameter {
// Common operator header; kept as the first member.
// NOTE(review): presumably so the struct can be handled through an OpParameter pointer -- confirm.
OpParameter op_parameter_;
// Epsilon of the operator.
// NOTE(review): presumably forwarded as the `epsilon` argument of FusedBatchNorm() -- confirm at the call site.
float epsilon_;
} FusedBatchNormParameter;
// Fused batch normalization:
//   output = (input - mean[c]) / sqrt(variance[c] + epsilon) * scale[c] + offset[c]
// input_shapes must hold four entries; the last one is used as the channel count and the
// product of the first three as the number of elements per channel.
// scale_ptr, offest_ptr, mean_ptr and variance_ptr are indexed by channel.
void FusedBatchNorm(const float *input_ptr, const float *scale_ptr, const float *offest_ptr, const float *mean_ptr,
const float *variance_ptr, int *input_shapes, float epsilon, float *output_ptr);
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_FUSED_BATCHNORM_H_
...@@ -25,10 +25,9 @@ typedef struct ScaleParameter { ...@@ -25,10 +25,9 @@ typedef struct ScaleParameter {
int axis_size_; int axis_size_;
int inner_size_; int inner_size_;
int axis_; int axis_;
bool has_offset_; bool const_scale_ = false;
// todo yangruoqi: axis bool has_offset_ = false;
} ScaleParameter; } ScaleParameter;
int DoScale(float *in_data, float *out_data, float *scale, float *offset, int task_id, ScaleParameter *scale_param); int DoScale(float *in_data, float *out_data, float *scale, float *offset, int task_id, ScaleParameter *scale_param);
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_SCALE_H_ #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_SCALE_H_
...@@ -17,33 +17,20 @@ ...@@ -17,33 +17,20 @@
#include "mindspore/core/utils/log_adapter.h" #include "mindspore/core/utils/log_adapter.h"
#include "common/common_test.h" #include "common/common_test.h"
#include "mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/batchnorm.h" #include "mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/batchnorm.h"
#include "mindspore/lite/src/runtime/kernel/arm/nnacl/fused_batchnorm.h"
#include "mindspore/lite/src/kernel_registry.h" #include "mindspore/lite/src/kernel_registry.h"
#include "mindspore/lite/src/lite_kernel.h" #include "mindspore/lite/src/lite_kernel.h"
#include "mindspore/lite/src/common/file_utils.h"
namespace mindspore { namespace mindspore {
class TestBatchnormFp32 : public mindspore::Common { class TestBatchnormFp32 : public mindspore::Common {
public: public:
TestBatchnormFp32() {} TestBatchnormFp32() {}
}; };
TEST_F(TestBatchnormFp32, BNTest) { TEST_F(TestBatchnormFp32, BNTest) {
std::vector<float> in_data = {0.0669681, 0.959215, 0.252686, 0.613594, 0.811776, 0.139469, 0.322848, 0.118354, std::vector<float> in_data = {-11.18675, 11.433986, 11.386012, 11.245945, -2.7614849, 14.692399,
0.082978, 0.399467, 0.961267, 0.0247456, 0.0714259, 0.0791484, 0.0648625, 0.561612, -1.1983503, -6.6790967, 6.383416, -13.3213005, -8.693595, 9.476344};
0.412069, 0.311492, 0.46109, 0.377125, 0.369283, 0.0332446, 0.696142, 0.715973, std::vector<float> in_data1 = {12.352293, 5.122387, 14.249514};
0.525524, 0.477265, 0.0336351, 0.751577, 0.377548, 0.964603, 0.0196834, 0.174865}; std::vector<float> in_data2 = {14.632595, 0.70900035, 11.179003};
std::vector<float> in_data1 = {0.855446, 0.821765, 0.281008, 0.0798653, 0.22294, 0.793782, 0.963222, 0.17851,
0.667549, 0.274381, 0.592842, 0.216552, 0.190274, 0.237873, 0.610063, 0.307559,
0.830007, 0.760957, 0.583265, 0.763793, 0.456372, 0.391378, 0.547915, 0.862198,
0.510794, 0.826776, 0.515894, 0.30071, 0.404987, 0.184773};
std::vector<float> in_data2 = {0.712438, 0.4927, 0.078419, 0.310429, 0.546871, 0.0667141, 0.874321, 0.0265647,
0.685165, 0.732586, 0.952889, 0.506402, 0.540784, 0.131119, 0.357713, 0.678992,
0.960839, 0.340706, 0.697678, 0.398146, 0.313321, 0.6485, 0.739153, 0.00190134,
0.536842, 0.996873, 0.445276, 0.371212, 0.420397, 0.0930115};
std::vector<float> in_data3(32, 1);
std::vector<float> in_data4(32, 0);
std::vector<lite::tensor::Tensor *> inputs_tensor; std::vector<lite::tensor::Tensor *> inputs_tensor;
std::vector<lite::tensor::Tensor *> outputs_tensor; std::vector<lite::tensor::Tensor *> outputs_tensor;
...@@ -51,8 +38,7 @@ TEST_F(TestBatchnormFp32, BNTest) { ...@@ -51,8 +38,7 @@ TEST_F(TestBatchnormFp32, BNTest) {
op_param.op_parameter_.type_ = schema::PrimitiveType_BatchNorm; op_param.op_parameter_.type_ = schema::PrimitiveType_BatchNorm;
op_param.epsilon_ = 0.001f; op_param.epsilon_ = 0.001f;
std::vector<int> in_shape = {1, 2, 4, 4}; std::vector<int> shape = {1, 2, 2, 3};
lite::tensor::Tensor input0_tensor; lite::tensor::Tensor input0_tensor;
lite::tensor::Tensor input1_tensor; lite::tensor::Tensor input1_tensor;
lite::tensor::Tensor input2_tensor; lite::tensor::Tensor input2_tensor;
...@@ -62,39 +48,40 @@ TEST_F(TestBatchnormFp32, BNTest) { ...@@ -62,39 +48,40 @@ TEST_F(TestBatchnormFp32, BNTest) {
input0_tensor.SetData(in_data.data()); input0_tensor.SetData(in_data.data());
input1_tensor.SetData(in_data1.data()); input1_tensor.SetData(in_data1.data());
input2_tensor.SetData(in_data2.data()); input2_tensor.SetData(in_data2.data());
input0_tensor.set_shape(in_shape); input0_tensor.set_shape(shape);
input1_tensor.set_shape({3});
input2_tensor.set_shape({3});
std::vector<float> output(32); std::vector<float> output(12);
std::vector<float> corr_out(32); std::vector<float> corr_out = {-6.1533737, 7.4904885, -0.8563998, -0.289212, -9.356432, 0.13245535,
std::vector<int> output_shape = {1, 2, 4, 4}; -3.5422924, -14.005781, -2.3525476, -6.7113695, -16.396551, -1.4275324};
lite::tensor::Tensor output0_tensor; lite::tensor::Tensor output0_tensor;
outputs_tensor.push_back(&output0_tensor); outputs_tensor.push_back(&output0_tensor);
output0_tensor.SetData(output.data()); output0_tensor.SetData(output.data());
output0_tensor.set_shape(shape);
kernel::KernelKey desc = {kernel::KERNEL_ARCH::kCPU, kNumberTypeFloat32, schema::PrimitiveType_BatchNorm}; kernel::KernelKey desc = {kernel::KERNEL_ARCH::kCPU, kNumberTypeFloat32, schema::PrimitiveType_BatchNorm};
auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc); auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc);
ASSERT_NE(creator, nullptr); ASSERT_NE(creator, nullptr);
lite::Context ctx; lite::Context ctx;
ctx.thread_num_ = 7; ctx.thread_num_ = 1;
kernel::LiteKernel *kernel = kernel::LiteKernel *kernel =
creator(inputs_tensor, outputs_tensor, reinterpret_cast<OpParameter *>(&op_param), &ctx, desc, nullptr); creator(inputs_tensor, outputs_tensor, reinterpret_cast<OpParameter *>(&op_param), &ctx, desc, nullptr);
ASSERT_NE(kernel, nullptr); ASSERT_NE(kernel, nullptr);
auto output_tensor_shape = output0_tensor.shape(); auto output_tensor_shape = output0_tensor.shape();
kernel->Run(); kernel->Run();
FusedBatchNorm(in_data.data(), in_data3.data(), in_data4.data(), in_data1.data(), in_data2.data(), in_shape.data(),
0.001f, corr_out.data());
printf("==================output data=================\n"); printf("==================output data=================\n");
for (int i = 0; i < 1 * 28; i++) { for (int i = 0; i < output0_tensor.ElementsNum(); i++) {
std::cout << output[i] << " ,"; std::cout << output[i] << " ,";
} }
std::cout << std::endl; std::cout << std::endl;
CompareOutputData(output.data(), corr_out.data(), 32, 0.00001); CompareOutputData(output.data(), corr_out.data(), output0_tensor.ElementsNum(), 0.001);
input0_tensor.SetData(nullptr); input0_tensor.SetData(nullptr);
input1_tensor.SetData(nullptr); input1_tensor.SetData(nullptr);
input2_tensor.SetData(nullptr); input2_tensor.SetData(nullptr);
output0_tensor.SetData(nullptr); output0_tensor.SetData(nullptr);
MS_LOG(INFO) << "TestBathNormFp32 accuracy passed";
} }
} // namespace mindspore } // namespace mindspore
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册