Commit c0e7f422 authored by mindspore-ci-bot, committed by Gitee

!3878 optimize lite arm cpu op: conv_depthwise, deconv_depthwise

Merge pull request !3878 from yangruoqi713/lite
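This PR replaces the layout-conversion function pointer (convert_func_) in the depthwise conv/deconv kernels with an explicit need_align_ flag plus direct NHWC4 pack/unpack calls, and splits the monolithic Init() into InitWeightBias() and InitBuffer() so that ReSize() only frees and rebuilds the packed buffers. Buffers are padded to channel multiples of C4NUM (fp32/int8 paths) or C8NUM (fp16 path) via UP_DIV. For reference, a minimal sketch of the helpers this diff assumes (the usual nnacl-style definitions, not copied from this commit):

// Sketch of assumed helpers:
#define C4NUM 4                                // channel tile for the fp32/int8 paths
#define C8NUM 8                                // channel tile for the fp16 path
#define UP_DIV(x, y) (((x) + (y) - 1) / (y))   // ceiling division

// Example: output_channel_ = 10 gives OC4 = UP_DIV(10, 4) = 3 blocks,
// so a packed buffer holds 3 * C4NUM = 12 channels, the last 2 zero-padded.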
......@@ -46,7 +46,6 @@ int ConvolutionDepthwiseFp16CPUKernel::InitBuffer() {
MS_LOG(ERROR) << "Malloc buffer failed.";
return RET_ERROR;
}
memset(packed_output_, 0, pack_output_size * sizeof(float16_t));
return RET_OK;
}
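(The memset dropped here appears to be made redundant by the fp16 kernel changes further down: DepthwiseBorderPixelFp16 and DepthwiseCenterFp16 now zero their C8 output vector per pixel before accumulating, so a bulk clear of packed_output_ is no longer needed.)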
......
......@@ -27,27 +27,7 @@ using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_DepthwiseConv2D;
namespace mindspore::kernel {
int ConvolutionDepthwiseCPUKernel::Init() {
// conv base init
ConvolutionBaseCPUKernel::Init();
// init sliding window param
sliding_ = new SlidingWindowParam;
InitSlidingParam(sliding_, conv_param_, C4NUM);
// pack input function: convert_func_
auto input_tensor = inputs_[kInputIndex];
auto data_type = input_tensor->data_type();
auto input_format = input_tensor->GetFormat();
schema::Format execute_format = schema::Format_NHWC4;
if (input_format != execute_format) {
convert_func_ = LayoutTransform(data_type, input_format, execute_format);
if (convert_func_ == nullptr) {
MS_LOG(ERROR) << "layout convert func is nullptr.";
return RET_ERROR;
}
}
int ConvolutionDepthwiseCPUKernel::InitWeightBias() {
// init weight: o, h, w, i; o == group, i == 1
auto weight_tensor = inputs_[kWeightIndex];
auto origin_weight = reinterpret_cast<float *>(weight_tensor->Data());
......@@ -55,42 +35,93 @@ int ConvolutionDepthwiseCPUKernel::Init() {
int pack_weight_size = C4NUM * OC4 * conv_param_->kernel_h_ * conv_param_->kernel_w_;
packed_weight_ = reinterpret_cast<float *>(malloc(pack_weight_size * sizeof(float)));
if (packed_weight_ == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed.";
return RET_ERROR;
}
memset(packed_weight_, 0, pack_weight_size * sizeof(float));
PackNCHWToNC4HW4Fp32(origin_weight, packed_weight_, 1, conv_param_->kernel_h_ * conv_param_->kernel_w_,
conv_param_->output_channel_);
// init bias
bias_data_ = reinterpret_cast<float *>(malloc(C4NUM * OC4 * sizeof(float)));
if (bias_data_ == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed.";
return RET_ERROR;
}
memset(bias_data_, 0, C4NUM * OC4 * sizeof(float));
if (inputs_.size() == kInputSize2) {
auto ori_bias = reinterpret_cast<float *>(inputs_.at(kBiasIndex)->Data());
memcpy(bias_data_, ori_bias, conv_param_->output_channel_ * sizeof(float));
} else {
MS_ASSERT(inputs_.size() == kInputSize1);
}
// init threadNum;
conv_param_->thread_num_ = MSMIN(thread_count_, OC4);
ReSize();
return RET_OK;
}
int ConvolutionDepthwiseCPUKernel::ReSize() {
// malloc pack input buffer
if (convert_func_ != nullptr) {
int ConvolutionDepthwiseCPUKernel::InitBuffer() {
// malloc pack input and output buffer
if (conv_param_->input_channel_ % C4NUM != 0) {
need_align_ = true;
int IC4 = UP_DIV(conv_param_->input_channel_, C4NUM);
int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C4NUM * IC4;
packed_input_ = reinterpret_cast<float *>(malloc(pack_input_size * sizeof(float)));
if (packed_input_ == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed.";
return RET_ERROR;
}
memset(packed_input_, 0, pack_input_size * sizeof(float));
}
// malloc tmp output buffer
if (conv_param_->output_channel_ % C4NUM != 0) {
need_align_ = true;
int OC4 = UP_DIV(conv_param_->output_channel_, C4NUM);
int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C4NUM * OC4;
packed_output_ = reinterpret_cast<float *>(malloc(pack_output_size * sizeof(float)));
if (packed_output_ == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed.";
return RET_ERROR;
}
memset(packed_output_, 0, pack_output_size * sizeof(float));
}
return RET_OK;
}
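For context on the need_align_ path used in Run() below: PackNHWCToNHWC4Fp32(src, dst, batch, plane, channel) copies each pixel's channels into a 4-aligned slot and zero-fills the tail. A minimal sketch of that behavior, assuming the signature visible in this diff (not the library's exact implementation; requires <cstring> and the UP_DIV/C4NUM helpers sketched above):

void PackNHWCToNHWC4Fp32Sketch(const float *src, float *dst, int batch, int plane, int channel) {
  int c4_channel = UP_DIV(channel, C4NUM) * C4NUM;
  for (int b = 0; b < batch; b++) {
    for (int p = 0; p < plane; p++) {
      const float *src_pixel = src + (b * plane + p) * channel;
      float *dst_pixel = dst + (b * plane + p) * c4_channel;
      memcpy(dst_pixel, src_pixel, channel * sizeof(float));                   // real channels
      memset(dst_pixel + channel, 0, (c4_channel - channel) * sizeof(float));  // zero padding
    }
  }
}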
int ConvolutionDepthwiseCPUKernel::Init() {
// conv base init
ConvolutionBaseCPUKernel::Init();
// init sliding window param
sliding_ = new SlidingWindowParam;
InitSlidingParam(sliding_, conv_param_, C4NUM);
auto ret = InitWeightBias();
if (ret != 0) {
MS_LOG(ERROR) << "Convolution depthwise fp32 InitWeightBias failed.";
return RET_ERROR;
}
ret = InitBuffer();
if (ret != 0) {
MS_LOG(ERROR) << "Convolution depthwise fp32 InitBuffer failed.";
return RET_ERROR;
}
return RET_OK;
}
int ConvolutionDepthwiseCPUKernel::ReSize() {
if (need_align_) {
free(packed_input_);
free(packed_output_);
}
// conv base init
ConvolutionBaseCPUKernel::Init();
// init sliding window param
sliding_ = new SlidingWindowParam;
InitSlidingParam(sliding_, conv_param_, C4NUM);
auto ret = InitBuffer();
if (ret != 0) {
MS_LOG(ERROR) << "Convolution depthwise fp32 InitBuffer failed.";
return RET_ERROR;
}
return RET_OK;
}
......@@ -120,15 +151,14 @@ int ConvolutionDepthwiseCPUKernel::Run() {
auto input_addr = reinterpret_cast<float *>(input_tensor->Data());
// pack input: to nhwc4
if (convert_func_ != nullptr) {
convert_func_(input_addr, packed_input_, conv_param_->input_batch_, conv_param_->input_h_ * conv_param_->input_w_,
conv_param_->input_channel_);
if (need_align_) {
PackNHWCToNHWC4Fp32(input_addr, packed_input_, conv_param_->input_batch_,
conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
} else {
packed_input_ = input_addr;
}
output_addr = reinterpret_cast<float *>(outputs_.at(kOutputIndex)->Data());
memset(output_addr, 0, outputs_.at(kOutputIndex)->ElementsNum() * sizeof(float));
auto output_addr = reinterpret_cast<float *>(outputs_.at(kOutputIndex)->Data());
if (!need_align_) {
packed_output_ = output_addr;
}
......@@ -146,7 +176,6 @@ int ConvolutionDepthwiseCPUKernel::Run() {
return RET_OK;
}
kernel::LiteKernel *CpuConvDwFp32KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
const std::vector<lite::tensor::Tensor *> &outputs,
OpParameter *opParameter, const Context *ctx,
......@@ -170,4 +199,3 @@ kernel::LiteKernel *CpuConvDwFp32KernelCreator(const std::vector<lite::tensor::T
REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_DepthwiseConv2D, CpuConvDwFp32KernelCreator)
} // namespace mindspore::kernel
......@@ -31,10 +31,8 @@ class ConvolutionDepthwiseCPUKernel : public ConvolutionBaseCPUKernel {
~ConvolutionDepthwiseCPUKernel() override {
delete sliding_;
free(packed_weight_);
if (convert_func_ != nullptr) {
free(packed_input_);
}
if (need_align_) {
free(packed_input_);
free(packed_output_);
}
};
......@@ -43,6 +41,8 @@ class ConvolutionDepthwiseCPUKernel : public ConvolutionBaseCPUKernel {
int ReSize() override;
int Run() override;
int InitBuffer();
int InitWeightBias();
int Execute(int task_id);
private:
......@@ -50,7 +50,6 @@ class ConvolutionDepthwiseCPUKernel : public ConvolutionBaseCPUKernel {
float *packed_weight_;
float *packed_input_;
float *packed_output_;
float *output_addr;
bool need_align_ = false;
};
} // namespace mindspore::kernel
......
......@@ -43,24 +43,7 @@ int DeconvolutionDepthwiseCPUKernel::InitSlideParam() {
return RET_OK;
}
int DeconvolutionDepthwiseCPUKernel::Init() {
InitSlideParam();
// conv base init
ConvolutionBaseCPUKernel::Init();
// pack input function: convert_func_
auto input_tensor = inputs_[kInputIndex];
auto data_type = input_tensor->data_type();
auto input_format = input_tensor->GetFormat();
schema::Format execute_format = schema::Format_NHWC4;
if (input_format != execute_format) {
convert_func_ = LayoutTransform(data_type, input_format, execute_format);
if (convert_func_ == nullptr) {
MS_LOG(ERROR) << "layout convert func is nullptr.";
return RET_ERROR;
}
}
int DeconvolutionDepthwiseCPUKernel::InitWeightBias() {
// init weight: o, h, w, i; o == group, i == 1
auto weight_tensor = inputs_[kWeightIndex];
auto origin_weight = reinterpret_cast<float *>(weight_tensor->Data());
......@@ -68,55 +51,102 @@ int DeconvolutionDepthwiseCPUKernel::Init() {
int pack_weight_size = C4NUM * OC4 * conv_param_->kernel_h_ * conv_param_->kernel_w_;
packed_weight_ = reinterpret_cast<float *>(malloc(pack_weight_size * sizeof(float)));
if (packed_weight_ == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed.";
return RET_ERROR;
}
memset(packed_weight_, 0, pack_weight_size * sizeof(float));
PackNCHWToNC4HW4Fp32(origin_weight, packed_weight_, 1, conv_param_->kernel_h_ * conv_param_->kernel_w_,
conv_param_->output_channel_);
// init bias
bias_data_ = reinterpret_cast<float *>(malloc(C4NUM * OC4 * sizeof(float)));
if (bias_data_ == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed.";
return RET_ERROR;
}
memset(bias_data_, 0, C4NUM * OC4 * sizeof(float));
if (inputs_.size() == kInputSize2) {
auto ori_bias = reinterpret_cast<float *>(inputs_.at(kBiasIndex)->Data());
memcpy(bias_data_, ori_bias, conv_param_->output_channel_ * sizeof(float));
} else {
MS_ASSERT(inputs_.size() == kInputSize1);
}
// init threadNum;
conv_param_->thread_num_ = MSMIN(conv_param_->thread_num_, OC4);
ReSize();
return RET_OK;
}
int DeconvolutionDepthwiseCPUKernel::ReSize() {
// malloc pack input buffer
if (convert_func_ != nullptr) {
int DeconvolutionDepthwiseCPUKernel::InitBuffer() {
// malloc pack input and output buffer
if (conv_param_->input_channel_ % C4NUM != 0) {
need_align_ = true;
int IC4 = UP_DIV(conv_param_->input_channel_, C4NUM);
int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C4NUM * IC4;
packed_input_ = reinterpret_cast<float *>(malloc(pack_input_size * sizeof(float)));
if (packed_input_ == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed.";
return RET_ERROR;
}
memset(packed_input_, 0, pack_input_size * sizeof(float));
}
// malloc tmp output buffer
if (conv_param_->output_channel_ % C4NUM != 0) {
need_align_ = true;
int OC4 = UP_DIV(conv_param_->output_channel_, C4NUM);
int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C4NUM * OC4;
packed_output_ = reinterpret_cast<float *>(malloc(pack_output_size * sizeof(float)));
if (packed_output_ == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed.";
return RET_ERROR;
}
memset(packed_output_, 0, pack_output_size * sizeof(float));
}
return RET_OK;
}
int DeconvolutionDepthwiseCPUKernel::DoExcute(int task_id) {
int DeconvolutionDepthwiseCPUKernel::Init() {
InitSlideParam();
// conv base init
ConvolutionBaseCPUKernel::Init();
auto ret = InitWeightBias();
if (ret != 0) {
MS_LOG(ERROR) << "Deconvolution depthwise fp32 InitWeightBias failed.";
return RET_ERROR;
}
ret = InitBuffer();
if (ret != 0) {
MS_LOG(ERROR) << "Deconvolution depthwise fp32 InitBuffer failed.";
return RET_ERROR;
}
return RET_OK;
}
int DeconvolutionDepthwiseCPUKernel::ReSize() {
if (need_align_) {
free(packed_input_);
free(packed_output_);
}
InitSlideParam();
// conv base init
ConvolutionBaseCPUKernel::Init();
auto ret = InitBuffer();
if (ret != 0) {
MS_LOG(ERROR) << "Deconvolution depthwise fp32 InitBuffer failed.";
return RET_ERROR;
}
return RET_OK;
}
int DeconvolutionDepthwiseCPUKernel::Execute(int task_id) {
DeconvDwC4Fp32(packed_output_, packed_input_, packed_weight_, reinterpret_cast<float *>(bias_data_), conv_param_,
sliding_, task_id);
return RET_OK;
}
int DeconvDwRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) {
auto conv_dw = reinterpret_cast<DeconvolutionDepthwiseCPUKernel *>(cdata);
auto ret = conv_dw->DoExcute(task_id);
auto deconv_dw = reinterpret_cast<DeconvolutionDepthwiseCPUKernel *>(cdata);
auto ret = deconv_dw->Execute(task_id);
if (ret != RET_OK) {
MS_LOG(ERROR) << "DeconvolutionDepthwiseRun error task_id[" << task_id << "] error_code[" << ret << "]";
return RET_ERROR;
......@@ -133,26 +163,26 @@ int DeconvolutionDepthwiseCPUKernel::Run() {
auto input_addr = reinterpret_cast<float *>(input_tensor->Data());
// pack input: to nhwc4
if (convert_func_ != nullptr) {
convert_func_(input_addr, packed_input_, conv_param_->input_batch_, conv_param_->input_h_ * conv_param_->input_w_,
conv_param_->input_channel_);
if (need_align_) {
PackNHWCToNHWC4Fp32(input_addr, packed_input_, conv_param_->input_batch_,
conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
} else {
packed_input_ = input_addr;
}
output_addr = reinterpret_cast<float *>(outputs_.at(kOutputIndex)->Data());
memset(output_addr, 0, outputs_.at(kOutputIndex)->ElementsNum() * sizeof(float));
if (!need_pack_) {
auto output_addr = reinterpret_cast<float *>(outputs_.at(kOutputIndex)->Data());
if (!need_align_) {
memset(output_addr, 0, outputs_.at(kOutputIndex)->ElementsNum() * sizeof(float));
packed_output_ = output_addr;
}
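// Why the zero fill above matters: DeconvDwC4Fp32 scatters partial sums, i.e.
// each input pixel accumulates into a window of output pixels, schematically
//   dst[d_offset + c] += src[s_offset + c] * weight[w_offset + c];
// a read-modify-write, so the destination must start at zero. In the
// aligned-malloc case, packed_output_ was already zeroed in InitBuffer.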
auto ret = LiteBackendParallelLaunch(DeconvDwRun, this, conv_param_->thread_num_);
if (ret != RET_OK) {
MS_LOG(ERROR) << "ConvDwRun error: error_code[" << ret << "]";
MS_LOG(ERROR) << "DeconvDwRun error: error_code[" << ret << "]";
return RET_ERROR;
}
if (need_pack_) {
if (need_align_) {
PackNHWC4ToNHWCFp32(packed_output_, output_addr, conv_param_->output_batch_,
conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
}
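PackNHWC4ToNHWCFp32 is the inverse of the packing sketched earlier: it keeps only the real channels and drops the zero padding. A minimal sketch under the same assumed signature:

void PackNHWC4ToNHWCFp32Sketch(const float *src, float *dst, int batch, int plane, int channel) {
  int c4_channel = UP_DIV(channel, C4NUM) * C4NUM;
  for (int b = 0; b < batch; b++) {
    for (int p = 0; p < plane; p++) {
      // copy only the real channels; the padded tail is discarded
      memcpy(dst + (b * plane + p) * channel, src + (b * plane + p) * c4_channel,
             channel * sizeof(float));
    }
  }
}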
......@@ -182,4 +212,3 @@ kernel::LiteKernel *CpuDeconvDwFp32KernelCreator(const std::vector<lite::tensor:
REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_DeDepthwiseConv2D, CpuDeconvDwFp32KernelCreator)
} // namespace mindspore::kernel
......@@ -31,8 +31,10 @@ class DeconvolutionDepthwiseCPUKernel : public ConvolutionBaseCPUKernel {
~DeconvolutionDepthwiseCPUKernel() override {
delete sliding_;
free(packed_weight_);
free(packed_input_);
free(packed_output_);
if (need_align_) {
free(packed_input_);
free(packed_output_);
}
};
int Init() override;
......@@ -40,17 +42,17 @@ class DeconvolutionDepthwiseCPUKernel : public ConvolutionBaseCPUKernel {
int ReSize() override;
int Run() override;
int DoExcute(int task_id);
int InitBuffer();
int InitWeightBias();
int Execute(int task_id);
private:
SlidingWindowParam *sliding_;
float *packed_weight_;
float *packed_input_;
float *packed_output_;
float *output_addr;
bool need_pack_ = false;
bool need_align_ = false;
};
} // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_DECONVOLUTION_DEPTHWISE_H_
......@@ -35,11 +35,19 @@ int ConvolutionDepthwiseInt8CPUKernel::InitWeightBias() {
int OC4 = UP_DIV(conv_param_->output_channel_, C4NUM);
int pack_weight_size = C4NUM * OC4 * conv_param_->kernel_h_ * conv_param_->kernel_w_;
packed_weight_ = reinterpret_cast<int16_t *>(malloc(pack_weight_size * sizeof(int16_t)));
if (packed_weight_ == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed.";
return RET_ERROR;
}
memset(packed_weight_, 0, pack_weight_size * sizeof(int16_t));
PackDepthwiseInt8Weight(origin_weight, packed_weight_, conv_param_);
// init bias, add output zp
bias_data_ = reinterpret_cast<int32_t *>(malloc(C4NUM * OC4 * sizeof(int32_t)));
if (bias_data_ == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed.";
return RET_ERROR;
}
memset(bias_data_, 0, C4NUM * OC4 * sizeof(int32_t));
if (inputs_.size() == kInputSize2) {
auto ori_bias = reinterpret_cast<int32_t *>(inputs_.at(kBiasIndex)->Data());
......@@ -48,6 +56,30 @@ int ConvolutionDepthwiseInt8CPUKernel::InitWeightBias() {
return RET_OK;
}
int ConvolutionDepthwiseInt8CPUKernel::InitBuffer() {
// malloc packed input buffer
int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C4NUM *
UP_DIV(conv_param_->input_channel_, 4);
packed_input_ = reinterpret_cast<int16_t *>(malloc(pack_input_size * sizeof(int16_t)));
if (packed_input_ == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed.";
return RET_ERROR;
}
memset(packed_input_, 0, pack_input_size * sizeof(int16_t));
if (conv_param_->input_channel_ % C4NUM != 0) {
need_align_ = true;
int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C4NUM *
UP_DIV(conv_param_->output_channel_, C4NUM);
packed_output_ = reinterpret_cast<int8_t *>(malloc(pack_output_size * sizeof(int8_t)));
if (packed_output_ == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed.";
return RET_ERROR;
}
memset(packed_output_, 0, pack_output_size * sizeof(int8_t));
}
return RET_OK;
}
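Note that the int8 kernel packs inputs into int16_t: per-tap products are computed in 16-bit and accumulated in 32-bit to avoid overflow, and in the usual nnacl scheme the input zero point is folded in while widening. An illustrative sketch of that idea (a hypothetical helper, not the exact PackDepthwiseInt8Input implementation):

// Widen int8 to int16 and remove the input zero point, so the inner loop
// can accumulate int16 * int16 products into int32 without overflow.
void WidenInt8ToInt16Sketch(const int8_t *src, int16_t *dst, int count, int input_zp) {
  for (int i = 0; i < count; i++) {
    dst[i] = (int16_t)src[i] - (int16_t)input_zp;
  }
}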
int ConvolutionDepthwiseInt8CPUKernel::Init() {
// conv base init
ConvolutionBaseCPUKernel::Init();
......@@ -66,7 +98,7 @@ int ConvolutionDepthwiseInt8CPUKernel::Init() {
return ret;
}
ret = ReSize();
ret = InitBuffer();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Depthwise int8 ReSize error!";
return ret;
......@@ -75,26 +107,23 @@ int ConvolutionDepthwiseInt8CPUKernel::Init() {
}
int ConvolutionDepthwiseInt8CPUKernel::ReSize() {
// malloc packed input buffer
int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C4NUM *
UP_DIV(conv_param_->input_channel_, 4);
packed_input_ = reinterpret_cast<int16_t *>(malloc(pack_input_size * sizeof(int16_t)));
memset(packed_input_, 0, pack_input_size * sizeof(int16_t));
if (packed_input_ == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed.";
return RET_ERROR;
free(packed_input_);
if (need_align_) {
free(packed_output_);
}
// conv base init
ConvolutionBaseCPUKernel::Init();
if (conv_param_->input_channel_ % C4NUM != 0) {
need_align_ = true;
int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C4NUM *
(conv_param_->output_channel_, C4NUM);
packed_output_ = reinterpret_cast<int8_t *>(malloc(pack_output_size * sizeof(int8_t)));
if (packed_input_ == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed.";
return RET_ERROR;
}
memset(packed_output_, 0, pack_output_size * sizeof(int8_t));
// init sliding window param
InitSlidingParam(sliding, conv_param_, C4NUM);
// init quant param
ConvolutionBaseCPUKernel::SetQuantParam();
auto ret = InitBuffer();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Depthwise int8 ReSize error!";
return ret;
}
return RET_OK;
}
......@@ -106,8 +135,8 @@ int ConvolutionDepthwiseInt8CPUKernel::Execute(int task_id) {
}
int ConvDwInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) {
auto conv_dw = reinterpret_cast<ConvolutionDepthwiseInt8CPUKernel *>(cdata);
auto ret = conv_dw->Execute(task_id);
auto conv_dw_int8 = reinterpret_cast<ConvolutionDepthwiseInt8CPUKernel *>(cdata);
auto ret = conv_dw_int8->Execute(task_id);
if (ret != RET_OK) {
MS_LOG(ERROR) << "ConvolutionDepthwiseInt8Run error task_id[" << task_id << "] error_code[" << ret << "]";
return RET_ERROR;
......@@ -127,7 +156,6 @@ int ConvolutionDepthwiseInt8CPUKernel::Run() {
PackDepthwiseInt8Input(input_addr, packed_input_, conv_param_);
auto output_addr = reinterpret_cast<int8_t *>(outputs_.at(kOutputIndex)->Data());
memset(output_addr, 0, outputs_.at(kOutputIndex)->ElementsNum() * sizeof(int8_t));
if (!need_align_) {
packed_output_ = output_addr;
}
......
......@@ -42,6 +42,7 @@ class ConvolutionDepthwiseInt8CPUKernel : public ConvolutionBaseCPUKernel {
int Run() override;
int InitWeightBias();
int InitBuffer();
int Execute(int task_id);
private:
......
......@@ -35,11 +35,19 @@ int DeconvolutionDepthwiseInt8CPUKernel::InitWeightBias() {
int OC4 = UP_DIV(conv_param_->output_channel_, C4NUM);
int pack_weight_size = C4NUM * OC4 * conv_param_->kernel_h_ * conv_param_->kernel_w_;
packed_weight_ = reinterpret_cast<int16_t *>(malloc(pack_weight_size * sizeof(int16_t)));
if (packed_weight_ == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed.";
return RET_ERROR;
}
memset(packed_weight_, 0, pack_weight_size * sizeof(int16_t));
PackDepthwiseInt8Weight(origin_weight, packed_weight_, conv_param_);
// init bias, add output zp
bias_data_ = reinterpret_cast<int32_t *>(malloc(C4NUM * OC4 * sizeof(int32_t)));
if (bias_data_ == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed.";
return RET_ERROR;
}
memset(bias_data_, 0, C4NUM * OC4 * sizeof(int32_t));
if (inputs_.size() == kInputSize2) {
auto ori_bias = reinterpret_cast<int32_t *>(inputs_.at(kBiasIndex)->Data());
......@@ -59,7 +67,6 @@ int DeconvolutionDepthwiseInt8CPUKernel::InitSlideParam() {
conv_param_->output_channel_ = inputs_.front()->shape().at(kNHWC_C);
// init sliding window param
sliding = new SlidingWindowParam;
InitSlidingParam(sliding, conv_param_, C4NUM);
sliding->in_h_step_ = conv_param_->input_w_ * C4NUM;
......@@ -70,31 +77,7 @@ int DeconvolutionDepthwiseInt8CPUKernel::InitSlideParam() {
return RET_OK;
}
int DeconvolutionDepthwiseInt8CPUKernel::Init() {
InitSlideParam();
// conv base init
ConvolutionBaseCPUKernel::Init();
// init quant param
ConvolutionBaseCPUKernel::SetQuantParam();
// init weight and bias
auto ret = InitWeightBias();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Deconv Depthwise int8 InitWeightBias error!";
return ret;
}
ret = ReSize();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Deconv Depthwise int8 ReSize error!";
return ret;
}
return RET_OK;
}
int DeconvolutionDepthwiseInt8CPUKernel::ReSize() {
int DeconvolutionDepthwiseInt8CPUKernel::InitBuffer() {
// malloc packed input buffer
int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C4NUM *
UP_DIV(conv_param_->input_channel_, 4);
......@@ -108,9 +91,9 @@ int DeconvolutionDepthwiseInt8CPUKernel::ReSize() {
if (conv_param_->input_channel_ % C4NUM != 0) {
need_align_ = true;
int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C4NUM *
(conv_param_->output_channel_, C4NUM);
UP_DIV(conv_param_->output_channel_, C4NUM);
packed_output_ = reinterpret_cast<int8_t *>(malloc(pack_output_size * sizeof(int8_t)));
if (packed_input_ == nullptr) {
if (packed_output_ == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed.";
return RET_ERROR;
}
......@@ -120,6 +103,10 @@ int DeconvolutionDepthwiseInt8CPUKernel::ReSize() {
// malloc tmp buffer for int32 output
output_buffer =
reinterpret_cast<int32_t *>(malloc(conv_param_->output_h_ * conv_param_->output_w_ * C4NUM * sizeof(int32_t)));
if (output_buffer == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed.";
return RET_ERROR;
}
if (packed_input_ == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed.";
return RET_ERROR;
......@@ -127,6 +114,49 @@ int DeconvolutionDepthwiseInt8CPUKernel::ReSize() {
return RET_OK;
}
int DeconvolutionDepthwiseInt8CPUKernel::Init() {
sliding = new SlidingWindowParam;
InitSlideParam();
// conv base init
ConvolutionBaseCPUKernel::Init();
// init quant param
ConvolutionBaseCPUKernel::SetQuantParam();
// init weight and bias
auto ret = InitWeightBias();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Deconv Depthwise int8 InitWeightBias error!";
return ret;
}
ret = InitBuffer();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Deconv Depthwise int8 InitBuffer error!";
return ret;
}
return RET_OK;
}
int DeconvolutionDepthwiseInt8CPUKernel::ReSize() {
free(packed_input_);
if (need_align_) {
free(packed_output_);
}
InitSlideParam();
// conv base init
ConvolutionBaseCPUKernel::Init();
auto ret = InitBuffer();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Deconv Depthwise int8 InitBuffer error!";
return ret;
}
return RET_OK;
}
int DeconvolutionDepthwiseInt8CPUKernel::Execute(int task_id) {
DeconvDwInt8(packed_output_, output_buffer, packed_input_, packed_weight_, reinterpret_cast<int32_t *>(bias_data_),
conv_param_, sliding, task_id);
......@@ -134,8 +164,8 @@ int DeconvolutionDepthwiseInt8CPUKernel::Execute(int task_id) {
}
int DeconvDwInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) {
auto deconv_dw = reinterpret_cast<DeconvolutionDepthwiseInt8CPUKernel *>(cdata);
auto ret = deconv_dw->Execute(task_id);
auto deconv_dw_int8 = reinterpret_cast<DeconvolutionDepthwiseInt8CPUKernel *>(cdata);
auto ret = deconv_dw_int8->Execute(task_id);
if (ret != RET_OK) {
MS_LOG(ERROR) << "DeconvolutionDepthwiseInt8Run error task_id[" << task_id << "] error_code[" << ret << "]";
return RET_ERROR;
......@@ -155,8 +185,8 @@ int DeconvolutionDepthwiseInt8CPUKernel::Run() {
PackDepthwiseInt8Input(input_addr, packed_input_, conv_param_);
auto output_addr = reinterpret_cast<int8_t *>(outputs_.at(kOutputIndex)->Data());
memset(output_addr, 0, outputs_.at(kOutputIndex)->ElementsNum() * sizeof(int8_t));
if (!need_align_) {
memset(output_addr, 0, outputs_.at(kOutputIndex)->ElementsNum() * sizeof(int8_t));
packed_output_ = output_addr;
}
......
......@@ -43,6 +43,7 @@ class DeconvolutionDepthwiseInt8CPUKernel : public ConvolutionBaseCPUKernel {
int InitSlideParam();
int InitWeightBias();
int InitBuffer();
int Execute(int task_id);
private:
......
......@@ -21,6 +21,9 @@
void DepthwiseBorderPixelFp16(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias,
int height, int width, int in_kh_step, int in_kw_step, int kernel_w, bool is_relu,
bool is_relu6) {
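// clear the 8-channel output accumulator before summing over the kernel window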
for (int c = 0; c < C8NUM; c++) {
dst[c] = 0;
}
const float16_t *src_kh = src;
const float16_t *weight_kh = weight;
for (int kh = 0; kh < height; kh++) {
......@@ -87,6 +90,9 @@ void DepthwiseCenterFp16(float16_t *dst, const float16_t *src, const float16_t *
for (int ow = 0; ow < width; ow++) {
const float16_t *src_kh = src_w;
const float16_t *weight_kh = weight;
for (int c = 0; c < C8NUM; c++) {
dst_w[c] = 0;
}
for (int kh = 0; kh < kernel_h; kh++) {
const float16_t *src_kw = src_kh;
const float16_t *weight_kw = weight_kh;
......@@ -297,4 +303,3 @@ void DeconvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const f
// output nchwc8
}
/*deconv depthwise fp16 end*/
......@@ -63,6 +63,9 @@ void DepthwiseBorderPixel(float *dst, const float *src, const float *weight, con
int in_kh_step, int in_kw_step, int kernel_w, bool is_relu, bool is_relu6) {
const float *src_kh = src;
const float *weight_kh = weight;
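// clear the 4-channel output accumulator before summing over the kernel window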
for (int c = 0; c < C4NUM; c++) {
dst[c] = 0;
}
for (int kh = 0; kh < height; kh++) {
const float *src_kw = src_kh;
const float *weight_kw = weight_kh;
......@@ -132,6 +135,9 @@ void DepthwiseCenter(float *dst, const float *src, const float *weight, const fl
for (int ow = 0; ow < width; ow++) {
const float *src_kh = src_w;
const float *weight_kh = weight;
for (int c = 0; c < C4NUM; c++) {
dst_w[c] = 0;
}
for (int kh = 0; kh < kernel_h; kh++) {
const float *src_kw = src_kh;
const float *weight_kw = weight_kh;
......@@ -202,7 +208,7 @@ void ConvDwC4Fp32(float *output_data, const float *input_data, const float *weig
src += sliding->in_step_;
dst += sliding->out_step_;
} // batch loop
// output nc4hwc4
// output nhwc4
}
/*conv depthwise fp32 end*/
......@@ -350,6 +356,6 @@ void DeconvDwC4Fp32(float *output_data, const float *input_data, const float *we
src += sliding->in_step_;
dst += sliding->out_step_;
} // batch loop
// output nc4hwc4
// output nhwc4
}
/*deconv depthwise fp32 end*/
......@@ -171,7 +171,7 @@ void ConvDwInt8(int8_t *output_data, const int16_t *input_data, const int16_t *w
src += sliding->in_step_;
dst += sliding->out_step_;
} // batch loop
// output nc4hwc4
// output nhwc4
}
/*conv depthwise int8 end*/
......@@ -317,6 +317,6 @@ void DeconvDwInt8(int8_t *output_data, int32_t *output_buffer, const int16_t *in
src += sliding->in_step_;
dst += sliding->out_step_;
} // batch loop
// output nc4hwc4
// output nhwc4
}
/*deconv depthwise int8 end*/