Commit ba588e20 authored by mindspore-ci-bot, committed by Gitee

!4692 [MS][LITE] optimize arm cpu op: conv depthwise

Merge pull request !4692 from yangruoqi713/lite
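The recurring change across the fp16, fp32, and int8 kernels in this diff is a need_align_ fast path: the NHWC activations are repacked into a channel-blocked layout (NHWC4/NHWC8) only when the channel count is not a multiple of the block size; otherwise the kernel works directly on the execute tensors and skips both the pack and unpack copies. A minimal sketch of that decision, assuming a block size of 4 and the UP_DIV rounding macro used throughout the diff (illustrative only, not the MindSpore Lite API):

#include <cstdlib>

#define C4NUM 4
#define UP_DIV(x, y) (((x) + (y)-1) / (y))

// Allocate a packed side buffer only when the channel count is not already a
// multiple of the block size; otherwise the tensor data can be used as-is.
float *MaybeAllocPacked(int batch, int plane, int channel, bool *need_align) {
  *need_align = (channel % C4NUM != 0);
  if (!*need_align) {
    return nullptr;  // fast path: no extra buffer, no per-inference pack/unpack copy
  }
  int c4 = UP_DIV(channel, C4NUM);
  return static_cast<float *>(malloc(sizeof(float) * batch * plane * C4NUM * c4));
}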
@@ -29,66 +29,67 @@ using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_DepthwiseConv2D;
 
 namespace mindspore::kernel {
-ConvolutionDepthwiseFp16CPUKernel::~ConvolutionDepthwiseFp16CPUKernel() { FreeTmpBuffer(); }
-
-void ConvolutionDepthwiseFp16CPUKernel::FreeTmpBuffer() {
+ConvolutionDepthwiseFp16CPUKernel::~ConvolutionDepthwiseFp16CPUKernel() {
   if (sliding_ != nullptr) {
     delete sliding_;
     sliding_ = nullptr;
   }
   if (packed_weight_ != nullptr) {
     delete packed_weight_;
     packed_weight_ = nullptr;
   }
-  if (packed_input_ != nullptr) {
-    delete packed_input_;
-    packed_input_ = nullptr;
-  }
-  if (packed_output_ != nullptr) {
-    delete packed_output_;
-    packed_output_ = nullptr;
+  FreeTmpBuffer();
+}
+
+void ConvolutionDepthwiseFp16CPUKernel::FreeTmpBuffer() {
+  if (need_align_) {
+    if (packed_input_ != nullptr) {
+      delete packed_input_;
+      packed_input_ = nullptr;
+    }
+    if (packed_output_ != nullptr) {
+      delete packed_output_;
+      packed_output_ = nullptr;
+    }
   }
 }
 
 int ConvolutionDepthwiseFp16CPUKernel::InitBuffer() {
-  // malloc pack input buffer
-  int C8 = UP_DIV(conv_param_->input_channel_, C8NUM);
-  int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C8NUM * C8;
-  packed_input_ = reinterpret_cast<float16_t *>(malloc(pack_input_size * sizeof(float16_t)));
-  if (packed_input_ == nullptr) {
-    MS_LOG(ERROR) << "Malloc buffer failed.";
-    return RET_ERROR;
-  }
-  memset(packed_input_, 0, pack_input_size * sizeof(float16_t));
-
-  // malloc pack output buffer
-  int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C8NUM * C8;
-  packed_output_ = reinterpret_cast<float16_t *>(malloc(pack_output_size * sizeof(float16_t)));
-  if (packed_output_ == nullptr) {
-    MS_LOG(ERROR) << "Malloc buffer failed.";
-    return RET_ERROR;
+  if (conv_param_->input_channel_ % C4NUM != 0) {
+    need_align_ = true;
+    int C8 = UP_DIV(conv_param_->input_channel_, C8NUM);
+    int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C8NUM * C8;
+    packed_input_ = reinterpret_cast<float16_t *>(malloc(pack_input_size * sizeof(float16_t)));
+    if (packed_input_ == nullptr) {
+      MS_LOG(ERROR) << "Malloc buffer failed.";
+      return RET_ERROR;
+    }
+    int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C8NUM * C8;
+    packed_output_ = reinterpret_cast<float16_t *>(malloc(pack_output_size * sizeof(float16_t)));
+    if (packed_output_ == nullptr) {
+      MS_LOG(ERROR) << "Malloc buffer failed.";
+      return RET_ERROR;
+    }
   }
   return RET_OK;
 }
 
 int ConvolutionDepthwiseFp16CPUKernel::InitWeightBias() {
   // init weight: o, h, w, i; o == group, i == 1
-  int OC8 = UP_DIV(conv_param_->output_channel_, C8NUM);
   auto weight_tensor = in_tensors_[kWeightIndex];
+  int OC8 = UP_DIV(weight_tensor->Batch(), C8NUM);
   auto origin_weight = reinterpret_cast<float *>(weight_tensor->Data());
-  int pack_weight_size = C8NUM * OC8 * conv_param_->kernel_h_ * conv_param_->kernel_w_;
+  int pack_weight_size = C8NUM * OC8 * weight_tensor->Height() * weight_tensor->Width();
   packed_weight_ = reinterpret_cast<float16_t *>(malloc(pack_weight_size * sizeof(float16_t)));
   if (packed_weight_ == nullptr) {
     MS_LOG(ERROR) << "Malloc buffer failed.";
     return RET_ERROR;
   }
-  memset(packed_weight_, 0, pack_weight_size * sizeof(float16_t));
-  PackNCHWFp32ToNC8HW8Fp16(origin_weight, packed_weight_, 1, conv_param_->kernel_h_ * conv_param_->kernel_w_,
-                           conv_param_->output_channel_);
+  PackNCHWFp32ToNC8HW8Fp16(origin_weight, packed_weight_, 1, weight_tensor->Height() * weight_tensor->Width(),
+                           weight_tensor->Batch());
 
-  // init bias
   bias_data_ = reinterpret_cast<float16_t *>(malloc(C8NUM * OC8 * sizeof(float16_t)));
   if (bias_data_ == nullptr) {
     MS_LOG(ERROR) << "Malloc buffer failed.";
@@ -97,8 +98,9 @@ int ConvolutionDepthwiseFp16CPUKernel::InitWeightBias() {
   memset(bias_data_, 0, C8NUM * OC8 * sizeof(float16_t));
   auto bias_fp16 = reinterpret_cast<float16_t *>(bias_data_);
   if (in_tensors_.size() == kInputSize2) {
-    auto ori_bias = reinterpret_cast<float *>(in_tensors_.at(kBiasIndex)->Data());
-    for (int i = 0; i < conv_param_->output_channel_; i++) {
+    auto bias_tensor = in_tensors_.at(kBiasIndex);
+    auto ori_bias = reinterpret_cast<float *>(bias_tensor->Data());
+    for (int i = 0; i < bias_tensor->ElementsNum(); i++) {
       bias_fp16[i] = (float16_t)ori_bias[i];
     }
   }
@@ -108,6 +110,18 @@ int ConvolutionDepthwiseFp16CPUKernel::InitWeightBias() {
 }
 
 int ConvolutionDepthwiseFp16CPUKernel::Init() {
+  sliding_ = new (std::nothrow) SlidingWindowParam;
+  if (sliding_ == nullptr) {
+    MS_LOG(ERROR) << "new sliding window param failed.";
+    return RET_ERROR;
+  }
+  auto ret = InitWeightBias();
+  if (ret != 0) {
+    MS_LOG(ERROR) << "Convolution depthwise fp16 InitWeightBias failed.";
+    return RET_ERROR;
+  }
   if (!InferShapeDone()) {
     return RET_OK;
   }
@@ -116,21 +130,12 @@ int ConvolutionDepthwiseFp16CPUKernel::Init() {
 int ConvolutionDepthwiseFp16CPUKernel::ReSize() {
   FreeTmpBuffer();
-  // conv base init
   auto ret = ConvolutionBaseCPUKernel::Init();
   if (ret != RET_OK) {
     return ret;
   }
-  // init sliding_ window param
-  sliding_ = new SlidingWindowParam;
   InitSlidingParamConvDw(sliding_, conv_param_, C8NUM);
 
-  ret = InitWeightBias();
-  if (ret != 0) {
-    MS_LOG(ERROR) << "Convolution depthwise fp16 InitWeightBias failed.";
-    return RET_ERROR;
-  }
   ret = InitBuffer();
   if (ret != 0) {
     MS_LOG(ERROR) << "Convolution depthwise fp16 InitBuffer failed.";
@@ -171,19 +176,25 @@ int ConvolutionDepthwiseFp16CPUKernel::Run() {
     MS_LOG(ERROR) << "Get Execute tensor failed.";
     return ret;
   }
-  // pack input: to nhwc8
-  PackNHWCToNHWC8Fp16(execute_input_, packed_input_, conv_param_->input_batch_,
-                      conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
+  if (need_align_) {
+    PackNHWCToNHWC8Fp16(execute_input_, packed_input_, conv_param_->input_batch_,
+                        conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
+  } else {
+    packed_input_ = execute_input_;
+  }
+  if (!need_align_) {
+    packed_output_ = execute_output_;
+  }
 
   ret = LiteBackendParallelLaunch(ConvDwFp16Run, this, conv_param_->thread_num_);
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "ConvDwFp16Run error: error_code[" << ret << "]";
     return RET_ERROR;
   }
-  PackNHWC8ToNHWCFp16(packed_output_, execute_output_, conv_param_->output_batch_,
-                      conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
+  if (need_align_) {
+    PackNHWC8ToNHWCFp16(packed_output_, execute_output_, conv_param_->output_batch_,
+                        conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
+  }
   ConvolutionBaseFP16CPUKernel::IfCastOutput();
   ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
   return RET_OK;
......
@@ -56,6 +56,7 @@ class ConvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseFP16CPUKernel {
   float16_t *packed_weight_ = nullptr;
   float16_t *packed_input_ = nullptr;
   float16_t *packed_output_ = nullptr;
+  bool need_align_ = false;
 };
 }  // namespace mindspore::kernel
......
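One consequence of the change above that is easy to miss: when need_align_ is false, packed_input_ and packed_output_ are just aliases of the execute tensors, which is why FreeTmpBuffer() only releases them when need_align_ is true. A small, hypothetical sketch of that ownership contract (plain float is used instead of float16_t to keep it self-contained):

#include <cstdlib>

// Hypothetical sketch of the ownership rule behind need_align_: aliased pointers
// must never be freed; only buffers actually allocated for repacking are owned.
struct DwBuffers {
  float *packed_input = nullptr;
  float *packed_output = nullptr;
  bool need_align = false;
};

void BindOrAlias(DwBuffers *b, float *exec_in, float *exec_out, int packed_elems) {
  if (b->need_align) {
    b->packed_input = static_cast<float *>(malloc(packed_elems * sizeof(float)));   // owned
    b->packed_output = static_cast<float *>(malloc(packed_elems * sizeof(float)));  // owned
  } else {
    b->packed_input = exec_in;    // alias, not owned
    b->packed_output = exec_out;  // alias, not owned
  }
}

void ReleaseBuffers(DwBuffers *b) {
  if (b->need_align) {  // only owned buffers are released
    free(b->packed_input);
    free(b->packed_output);
  }
  b->packed_input = b->packed_output = nullptr;
}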
@@ -28,25 +28,28 @@ using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_DeDepthwiseConv2D;
 
 namespace mindspore::kernel {
-DeconvolutionDepthwiseFp16CPUKernel::~DeconvolutionDepthwiseFp16CPUKernel() { FreeTmpBuffer(); }
-
-void DeconvolutionDepthwiseFp16CPUKernel::FreeTmpBuffer() {
+DeconvolutionDepthwiseFp16CPUKernel::~DeconvolutionDepthwiseFp16CPUKernel() {
   if (sliding_ != nullptr) {
     delete sliding_;
     sliding_ = nullptr;
   }
   if (packed_weight_ != nullptr) {
     delete packed_weight_;
     packed_weight_ = nullptr;
   }
-  if (packed_input_ != nullptr) {
-    delete packed_input_;
-    packed_input_ = nullptr;
-  }
-  if (packed_output_ != nullptr) {
-    delete packed_output_;
-    packed_output_ = nullptr;
+  FreeTmpBuffer();
+}
+
+void DeconvolutionDepthwiseFp16CPUKernel::FreeTmpBuffer() {
+  if (need_align_) {
+    if (packed_input_ != nullptr) {
+      delete packed_input_;
+      packed_input_ = nullptr;
+    }
+    if (packed_output_ != nullptr) {
+      delete packed_output_;
+      packed_output_ = nullptr;
+    }
   }
 }
@@ -59,14 +62,11 @@ int DeconvolutionDepthwiseFp16CPUKernel::InitSlideParam() {
   conv_param_->output_h_ = in_tensors_.front()->shape().at(kNHWC_H);
   conv_param_->output_w_ = in_tensors_.front()->shape().at(kNHWC_W);
   conv_param_->output_channel_ = in_tensors_.front()->shape().at(kNHWC_C);
-
-  // init sliding_ window param
   InitSlidingParamConvDw(sliding_, conv_param_, C8NUM);
   return RET_OK;
 }
 
 int DeconvolutionDepthwiseFp16CPUKernel::InitBuffer() {
-  // malloc pack input buffer
   int C8 = UP_DIV(conv_param_->input_channel_, C8NUM);
   int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C8NUM * C8;
   packed_input_ = reinterpret_cast<float16_t *>(malloc(pack_input_size * sizeof(float16_t)));
@@ -74,7 +74,6 @@ int DeconvolutionDepthwiseFp16CPUKernel::InitBuffer() {
     MS_LOG(ERROR) << "Malloc buffer failed.";
     return RET_ERROR;
   }
-  memset(packed_input_, 0, pack_input_size * sizeof(float16_t));
 
   int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C8NUM * C8;
   packed_output_ = reinterpret_cast<float16_t *>(malloc(pack_output_size * sizeof(float16_t)));
@@ -88,21 +87,19 @@ int DeconvolutionDepthwiseFp16CPUKernel::InitBuffer() {
 int DeconvolutionDepthwiseFp16CPUKernel::InitWeightBias() {
   // init weight: o, h, w, i; o == group, i == 1
-  int OC8 = UP_DIV(conv_param_->output_channel_, C8NUM);
   auto weight_tensor = in_tensors_[kWeightIndex];
+  int OC8 = UP_DIV(weight_tensor->Batch(), C8NUM);
   auto origin_weight = reinterpret_cast<float *>(weight_tensor->Data());
-  int pack_weight_size = C8NUM * OC8 * conv_param_->kernel_h_ * conv_param_->kernel_w_;
+  int pack_weight_size = C8NUM * OC8 * weight_tensor->Height() * weight_tensor->Width();
   packed_weight_ = reinterpret_cast<float16_t *>(malloc(pack_weight_size * sizeof(float16_t)));
   if (packed_weight_ == nullptr) {
     MS_LOG(ERROR) << "Malloc buffer failed.";
     return RET_ERROR;
   }
-  memset(packed_weight_, 0, pack_weight_size * sizeof(float16_t));
-  PackNCHWFp32ToNC8HW8Fp16(origin_weight, packed_weight_, 1, conv_param_->kernel_h_ * conv_param_->kernel_w_,
-                           conv_param_->output_channel_);
+  PackNCHWFp32ToNC8HW8Fp16(origin_weight, packed_weight_, 1, weight_tensor->Height() * weight_tensor->Width(),
+                           weight_tensor->Batch());
 
-  // init bias
   bias_data_ = reinterpret_cast<float16_t *>(malloc(C8NUM * OC8 * sizeof(float16_t)));
   if (bias_data_ == nullptr) {
     MS_LOG(ERROR) << "Malloc buffer failed.";
@@ -110,8 +107,9 @@ int DeconvolutionDepthwiseFp16CPUKernel::InitWeightBias() {
   }
   memset(bias_data_, 0, C8NUM * OC8 * sizeof(float16_t));
   if (in_tensors_.size() == kInputSize2) {
-    auto ori_bias = reinterpret_cast<float *>(in_tensors_.at(kBiasIndex)->Data());
-    for (int i = 0; i < conv_param_->output_channel_; i++) {
+    auto bias_tensor = in_tensors_.at(kBiasIndex);
+    auto ori_bias = reinterpret_cast<float *>(bias_tensor->Data());
+    for (int i = 0; i < bias_tensor->ElementsNum(); i++) {
       reinterpret_cast<float *>(bias_data_)[i] = (float16_t)ori_bias[i];
     }
   }
@@ -121,6 +119,17 @@ int DeconvolutionDepthwiseFp16CPUKernel::InitWeightBias() {
 }
 
 int DeconvolutionDepthwiseFp16CPUKernel::Init() {
+  sliding_ = new (std::nothrow) SlidingWindowParam;
+  if (sliding_ == nullptr) {
+    MS_LOG(ERROR) << "new SlidingWindowParam fail!";
+    return RET_ERROR;
+  }
+  auto ret = InitWeightBias();
+  if (ret != 0) {
+    MS_LOG(ERROR) << "Deconvolution depthwise fp16 InitWeightBias failed.";
+    return RET_ERROR;
+  }
   if (!InferShapeDone()) {
     return RET_OK;
   }
@@ -129,25 +138,11 @@ int DeconvolutionDepthwiseFp16CPUKernel::Init() {
 int DeconvolutionDepthwiseFp16CPUKernel::ReSize() {
   FreeTmpBuffer();
-  sliding_ = new (std::nothrow) SlidingWindowParam;
-  if (sliding_ == nullptr) {
-    MS_LOG(ERROR) << "new SlidingWindowParam fail!";
-    return RET_ERROR;
-  }
   InitSlideParam();
-  // conv base init
   auto ret = ConvolutionBaseCPUKernel::Init();
   if (ret != RET_OK) {
     return ret;
   }
-
-  ret = InitWeightBias();
-  if (ret != 0) {
-    MS_LOG(ERROR) << "Deconvolution depthwise fp16 InitWeightBias failed.";
-    return RET_ERROR;
-  }
   ret = InitBuffer();
   if (ret != 0) {
     MS_LOG(ERROR) << "Deconvolution depthwise fp16 InitBuffer failed.";
@@ -188,18 +183,26 @@ int DeconvolutionDepthwiseFp16CPUKernel::Run() {
     MS_LOG(ERROR) << "Get Execute tensor failed.";
     return ret;
   }
-  // pack input: to nhwc8
-  PackNHWCToNHWC8Fp16(execute_input_, packed_input_, conv_param_->input_batch_,
-                      conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
+  if (need_align_) {
+    PackNHWCToNHWC8Fp16(execute_input_, packed_input_, conv_param_->input_batch_,
+                        conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
+  } else {
+    packed_input_ = execute_input_;
+  }
+  if (!need_align_) {
+    packed_output_ = execute_output_;
+  }
 
   ret = LiteBackendParallelLaunch(DeconvDwFp16Run, this, conv_param_->thread_num_);
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "DeconvDwFp16Run error: error_code[" << ret << "]";
     return RET_ERROR;
   }
-  PackNHWC8ToNHWCFp16(packed_output_, execute_output_, conv_param_->output_batch_,
-                      conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
+  if (need_align_) {
+    PackNHWC8ToNHWCFp16(packed_output_, execute_output_, conv_param_->output_batch_,
+                        conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
+  }
   ConvolutionBaseFP16CPUKernel::IfCastOutput();
   ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
   return RET_OK;
......
@@ -57,6 +57,7 @@ class DeconvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseFP16CPUKernel
   float16_t *packed_weight_ = nullptr;
   float16_t *packed_input_ = nullptr;
   float16_t *packed_output_ = nullptr;
+  bool need_align_ = false;
 };
 }  // namespace mindspore::kernel
......
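Another pattern repeated in the InitWeightBias() hunks: the packed-weight geometry is now read from the weight tensor itself (Batch() is the output-channel/group count, Height() * Width() is the kernel plane) instead of from conv_param_, which is only fully populated after shape inference. A hedged sketch of that size computation; WeightDims is a stand-in for the real tensor accessors:

#include <cstddef>

#define C8NUM 8
#define UP_DIV(x, y) (((x) + (y)-1) / (y))

// Stand-in for the tensor accessors used in the diff (Batch/Height/Width).
struct WeightDims {
  int batch;   // == output channels == group for a depthwise weight laid out as (o, h, w, 1)
  int height;
  int width;
};

// Element count of the packed NC8HW8 weight: 8 * ceil(o / 8) * (h * w).
size_t PackedWeightCount(const WeightDims &w) {
  int oc8 = UP_DIV(w.batch, C8NUM);
  return static_cast<size_t>(C8NUM) * oc8 * w.height * w.width;
}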
@@ -29,18 +29,19 @@ using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_DepthwiseConv2D;
 
 namespace mindspore::kernel {
-ConvolutionDepthwiseCPUKernel::~ConvolutionDepthwiseCPUKernel() { FreeTmpBuffer(); }
-
-void ConvolutionDepthwiseCPUKernel::FreeTmpBuffer() {
+ConvolutionDepthwiseCPUKernel::~ConvolutionDepthwiseCPUKernel() {
   if (sliding_ != nullptr) {
     delete sliding_;
     sliding_ = nullptr;
   }
   if (packed_weight_ != nullptr) {
     delete packed_weight_;
     packed_weight_ = nullptr;
   }
+  FreeTmpBuffer();
+}
+
+void ConvolutionDepthwiseCPUKernel::FreeTmpBuffer() {
   if (need_align_) {
     if (packed_input_ != nullptr) {
       delete packed_input_;
@@ -57,19 +58,17 @@ int ConvolutionDepthwiseCPUKernel::InitWeightBias() {
   // init weight: o, h, w, i; o == group, i == 1
   auto weight_tensor = in_tensors_[kWeightIndex];
   auto origin_weight = reinterpret_cast<float *>(weight_tensor->Data());
-  int OC4 = UP_DIV(conv_param_->output_channel_, C4NUM);
-  int pack_weight_size = C4NUM * OC4 * conv_param_->kernel_h_ * conv_param_->kernel_w_;
+  int OC4 = UP_DIV(weight_tensor->Batch(), C4NUM);
+  int pack_weight_size = C4NUM * OC4 * weight_tensor->Height() * weight_tensor->Width();
   packed_weight_ = reinterpret_cast<float *>(malloc(pack_weight_size * sizeof(float)));
   if (packed_weight_ == nullptr) {
     MS_LOG(ERROR) << "Malloc buffer failed.";
     return RET_ERROR;
   }
-  memset(packed_weight_, 0, pack_weight_size * sizeof(float));
-  PackNCHWToNC4HW4Fp32(origin_weight, packed_weight_, 1, conv_param_->kernel_h_ * conv_param_->kernel_w_,
-                       conv_param_->output_channel_);
+  PackNCHWToNC4HW4Fp32(origin_weight, packed_weight_, 1, weight_tensor->Height() * weight_tensor->Width(),
+                       weight_tensor->Batch());
 
-  // init bias
   bias_data_ = reinterpret_cast<float *>(malloc(C4NUM * OC4 * sizeof(float)));
   if (bias_data_ == nullptr) {
     MS_LOG(ERROR) << "Malloc buffer failed.";
@@ -78,16 +77,14 @@ int ConvolutionDepthwiseCPUKernel::InitWeightBias() {
   memset(bias_data_, 0, C4NUM * OC4 * sizeof(float));
   if (in_tensors_.size() == kInputSize2) {
     auto ori_bias = reinterpret_cast<float *>(in_tensors_.at(kBiasIndex)->Data());
-    memcpy(bias_data_, ori_bias, conv_param_->output_channel_ * sizeof(float));
+    memcpy(bias_data_, ori_bias, in_tensors_.at(kBiasIndex)->ElementsNum() * sizeof(float));
   }
 
-  // init threadNum;
   conv_param_->thread_num_ = MSMIN(thread_count_, OC4);
   return RET_OK;
 }
 
 int ConvolutionDepthwiseCPUKernel::InitBuffer() {
-  // malloc pack input and output buffer
   if (conv_param_->input_channel_ % C4NUM != 0) {
     need_align_ = true;
     int IC4 = UP_DIV(conv_param_->input_channel_, C4NUM);
@@ -97,7 +94,6 @@ int ConvolutionDepthwiseCPUKernel::InitBuffer() {
       MS_LOG(ERROR) << "Malloc buffer failed.";
       return RET_ERROR;
     }
-    memset(packed_input_, 0, pack_input_size * sizeof(float));
 
     int OC4 = UP_DIV(conv_param_->output_channel_, C4NUM);
     int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C4NUM * OC4;
@@ -111,32 +107,29 @@ int ConvolutionDepthwiseCPUKernel::InitBuffer() {
 }
 
 int ConvolutionDepthwiseCPUKernel::Init() {
-  if (!InferShapeDone()) {
-    return RET_OK;
-  }
-  return ReSize();
-}
-
-int ConvolutionDepthwiseCPUKernel::ReSize() {
-  FreeTmpBuffer();
-  // conv base init
-  ConvolutionBaseCPUKernel::Init();
-
-  // init sliding window param
   sliding_ = new (std::nothrow) SlidingWindowParam;
   if (sliding_ == nullptr) {
     MS_LOG(ERROR) << "new sliding window param failed.";
     return RET_ERROR;
   }
-  InitSlidingParamConvDw(sliding_, conv_param_, C4NUM);
-
   auto ret = InitWeightBias();
   if (ret != 0) {
     MS_LOG(ERROR) << "Convolution depthwise fp32 InitWeightBias failed.";
     return RET_ERROR;
   }
+  if (!InferShapeDone()) {
+    return RET_OK;
+  }
+  return ReSize();
+}
+
+int ConvolutionDepthwiseCPUKernel::ReSize() {
+  FreeTmpBuffer();
+  ConvolutionBaseCPUKernel::Init();
+  InitSlidingParamConvDw(sliding_, conv_param_, C4NUM);
 
-  ret = InitBuffer();
+  auto ret = InitBuffer();
   if (ret != 0) {
     MS_LOG(ERROR) << "Convolution depthwise fp32 InitBuffer failed.";
     return RET_ERROR;
@@ -173,7 +166,6 @@ int ConvolutionDepthwiseCPUKernel::Run() {
   auto input_tensor = in_tensors_.at(kInputIndex);
   auto input_addr = reinterpret_cast<float *>(input_tensor->Data());
 
-  // pack input: to nhwc4
   if (need_align_) {
     PackNHWCToNHWC4Fp32(input_addr, packed_input_, conv_param_->input_batch_,
                         conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
......
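The fp32 hunks above also show the lifecycle reshuffle the whole patch applies: one-time work (allocating SlidingWindowParam, packing weights and bias) moves into Init(), while ReSize() keeps only the shape-dependent work (sliding-window geometry and the optional packed buffers). A condensed, hypothetical skeleton of that split; the method names mirror the kernels but the bodies here are stubs:

#include <new>

// Hypothetical skeleton of the Init()/ReSize() split used throughout this patch:
// shape-independent setup runs once in Init(), shape-dependent setup reruns in ReSize().
struct SlidingWindowParam {};

class DepthwiseKernelSketch {
 public:
  int Init() {
    sliding_ = new (std::nothrow) SlidingWindowParam;  // one-time allocation
    if (sliding_ == nullptr) return -1;
    if (InitWeightBias() != 0) return -1;              // weights do not depend on the input shape
    if (!shape_known_) return 0;                       // shapes unknown yet: ReSize() runs later
    return ReSize();
  }
  int ReSize() {
    FreeTmpBuffer();                                   // drop only the shape-dependent buffers
    InitSlidingParam();                                // recompute window geometry
    return InitBuffer();                               // reallocate packed buffers if needed
  }
  ~DepthwiseKernelSketch() { delete sliding_; }

 private:
  int InitWeightBias() { return 0; }  // stubs standing in for the real kernel methods
  int InitBuffer() { return 0; }
  void FreeTmpBuffer() {}
  void InitSlidingParam() {}
  SlidingWindowParam *sliding_ = nullptr;
  bool shape_known_ = false;
};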
@@ -27,12 +27,41 @@ using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_DepthwiseConv2D;
 
 namespace mindspore::kernel {
+ConvolutionDepthwise3x3CPUKernel::~ConvolutionDepthwise3x3CPUKernel() {
+  FreeTmpBufer();
+  if (block_buffer_ != nullptr) {
+    free(block_buffer_);
+    block_buffer_ = nullptr;
+  }
+  if (packed_weight_ != nullptr) {
+    free(packed_weight_);
+    packed_weight_ = nullptr;
+  }
+}
+
+void ConvolutionDepthwise3x3CPUKernel::FreeTmpBufer() {
+  if (need_align_) {
+    if (packed_input_ != nullptr) {
+      free(packed_input_);
+      packed_input_ = nullptr;
+    }
+    if (packed_output_ != nullptr) {
+      free(packed_output_);
+      packed_output_ = nullptr;
+    }
+  }
+  if (trans_buffer_ != nullptr) {
+    free(trans_buffer_);
+    trans_buffer_ = nullptr;
+  }
+}
+
 int ConvolutionDepthwise3x3CPUKernel::InitWeightBias() {
   // init weight: o, h, w, i; o == group, i == 1
   auto weight_tensor = in_tensors_[kWeightIndex];
   auto origin_weight = reinterpret_cast<float *>(weight_tensor->Data());
   // o h w 1 -> o/4 h w 1 4
-  int OC4 = UP_DIV(conv_param_->output_channel_, C4NUM);
+  int OC4 = UP_DIV(weight_tensor->Batch(), C4NUM);
   int weight_c4_size = OC4 * C4NUM * 9;
   auto tmp_weight = reinterpret_cast<float *>(malloc(weight_c4_size * sizeof(float)));
   if (tmp_weight == nullptr) {
@@ -40,8 +69,8 @@ int ConvolutionDepthwise3x3CPUKernel::InitWeightBias() {
     return RET_ERROR;
   }
   memset(tmp_weight, 0, weight_c4_size * sizeof(float));
-  PackNCHWToNC4HW4Fp32(origin_weight, tmp_weight, 1, conv_param_->kernel_h_ * conv_param_->kernel_w_,
-                       conv_param_->output_channel_);
+  PackNCHWToNC4HW4Fp32(origin_weight, tmp_weight, 1, weight_tensor->Height() * weight_tensor->Width(),
+                       weight_tensor->Batch());
 
   // weight transform
   int packed_weight_size = OC4 * C4NUM * 16;
@@ -62,8 +91,9 @@ int ConvolutionDepthwise3x3CPUKernel::InitWeightBias() {
   memset(bias_data_, 0, C4NUM * OC4 * sizeof(float));
   if (in_tensors_.size() == kInputSize2) {
     auto ori_bias = reinterpret_cast<float *>(in_tensors_.at(kBiasIndex)->Data());
-    memcpy(bias_data_, ori_bias, conv_param_->output_channel_ * sizeof(float));
+    memcpy(bias_data_, ori_bias, in_tensors_.at(kBiasIndex)->ElementsNum() * sizeof(float));
   }
+  conv_param_->thread_num_ = MSMIN(thread_count_, OC4);
   return RET_OK;
 }
@@ -106,48 +136,22 @@ int ConvolutionDepthwise3x3CPUKernel::Init() {
     MS_LOG(ERROR) << "malloc block buffer failed.";
     return RET_ERROR;
   }
+  auto ret = InitWeightBias();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Depthwise3x3 fp32 initWeightBias error!ret: " << ret;
+    return ret;
+  }
   if (!InferShapeDone()) {
     return RET_OK;
   }
   return ReSize();
 }
 
-void ConvolutionDepthwise3x3CPUKernel::FreeTmpBufer() {
-  if (need_align_) {
-    if (packed_input_ != nullptr) {
-      free(packed_input_);
-      packed_input_ = nullptr;
-    }
-    if (packed_output_ != nullptr) {
-      free(packed_output_);
-      packed_output_ = nullptr;
-    }
-  }
-  if (trans_buffer_ != nullptr) {
-    free(trans_buffer_);
-    trans_buffer_ = nullptr;
-  }
-  if (packed_weight_ != nullptr) {
-    free(packed_weight_);
-    packed_weight_ = nullptr;
-  }
-}
-
 int ConvolutionDepthwise3x3CPUKernel::ReSize() {
   FreeTmpBufer();
-  // conv base init
   ConvolutionBaseCPUKernel::Init();
 
-  auto ret = InitWeightBias();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Depthwise3x3 fp32 initWeightBias error!ret: " << ret;
-    return ret;
-  }
-
-  // init threadNum;
-  conv_param_->thread_num_ = MSMIN(thread_count_, UP_DIV(conv_param_->output_channel_, C4NUM));
-
-  ret = InitBuffer();
+  auto ret = InitBuffer();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Depthwise3x3 fp32 initBuffer error!ret: " << ret;
     return ret;
......
@@ -30,13 +30,7 @@ class ConvolutionDepthwise3x3CPUKernel : public ConvolutionBaseCPUKernel {
                                    const mindspore::lite::PrimitiveC *primitive)
       : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
-  ~ConvolutionDepthwise3x3CPUKernel() override {
-    FreeTmpBufer();
-    if (block_buffer_ != nullptr) {
-      free(block_buffer_);
-      block_buffer_ = nullptr;
-    }
-  };
+  ~ConvolutionDepthwise3x3CPUKernel() override;
 
   int Init() override;
   int ReSize() override;
......
@@ -27,18 +27,19 @@ using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_DeDepthwiseConv2D;
 
 namespace mindspore::kernel {
-DeconvolutionDepthwiseCPUKernel::~DeconvolutionDepthwiseCPUKernel() { FreeTmpBuffer(); }
-
-void DeconvolutionDepthwiseCPUKernel::FreeTmpBuffer() {
+DeconvolutionDepthwiseCPUKernel::~DeconvolutionDepthwiseCPUKernel() {
   if (sliding_ != nullptr) {
     delete sliding_;
     sliding_ = nullptr;
   }
   if (packed_weight_ != nullptr) {
     delete packed_weight_;
     packed_weight_ = nullptr;
   }
+  FreeTmpBuffer();
+}
+
+void DeconvolutionDepthwiseCPUKernel::FreeTmpBuffer() {
   if (need_align_) {
     if (packed_input_ != nullptr) {
       delete packed_input_;
@@ -60,9 +61,6 @@ int DeconvolutionDepthwiseCPUKernel::InitSlideParam() {
   conv_param_->output_h_ = in_tensors_.front()->shape().at(kNHWC_H);
   conv_param_->output_w_ = in_tensors_.front()->shape().at(kNHWC_W);
   conv_param_->output_channel_ = in_tensors_.front()->shape().at(kNHWC_C);
-
-  // init sliding window param
-  sliding_ = new SlidingWindowParam;
   InitSlidingParamConvDw(sliding_, conv_param_, C4NUM);
   return RET_OK;
 }
@@ -71,19 +69,17 @@ int DeconvolutionDepthwiseCPUKernel::InitWeightBias() {
   // init weight: o, h, w, i; o == group, i == 1
   auto weight_tensor = in_tensors_[kWeightIndex];
   auto origin_weight = reinterpret_cast<float *>(weight_tensor->Data());
-  int OC4 = UP_DIV(conv_param_->output_channel_, C4NUM);
-  int pack_weight_size = C4NUM * OC4 * conv_param_->kernel_h_ * conv_param_->kernel_w_;
+  int OC4 = UP_DIV(weight_tensor->Batch(), C4NUM);
+  int pack_weight_size = C4NUM * OC4 * weight_tensor->Height() * weight_tensor->Width();
   packed_weight_ = reinterpret_cast<float *>(malloc(pack_weight_size * sizeof(float)));
   if (packed_weight_ == nullptr) {
     MS_LOG(ERROR) << "Malloc buffer failed.";
     return RET_ERROR;
   }
-  memset(packed_weight_, 0, pack_weight_size * sizeof(float));
-  PackNCHWToNC4HW4Fp32(origin_weight, packed_weight_, 1, conv_param_->kernel_h_ * conv_param_->kernel_w_,
-                       conv_param_->output_channel_);
+  PackNCHWToNC4HW4Fp32(origin_weight, packed_weight_, 1, weight_tensor->Height() * weight_tensor->Width(),
+                       weight_tensor->Batch());
 
-  // init bias
   bias_data_ = reinterpret_cast<float *>(malloc(C4NUM * OC4 * sizeof(float)));
   if (bias_data_ == nullptr) {
     MS_LOG(ERROR) << "Malloc buffer failed.";
@@ -92,16 +88,14 @@ int DeconvolutionDepthwiseCPUKernel::InitWeightBias() {
   memset(bias_data_, 0, C4NUM * OC4 * sizeof(float));
   if (in_tensors_.size() == kInputSize2) {
     auto ori_bias = reinterpret_cast<float *>(in_tensors_.at(kBiasIndex)->Data());
-    memcpy(bias_data_, ori_bias, conv_param_->output_channel_ * sizeof(float));
+    memcpy(bias_data_, ori_bias, in_tensors_.at(kBiasIndex)->ElementsNum() * sizeof(float));
   }
 
-  // init threadNum;
-  conv_param_->thread_num_ = MSMIN(conv_param_->thread_num_, OC4);
+  conv_param_->thread_num_ = MSMIN(thread_count_, OC4);
   return RET_OK;
 }
 
 int DeconvolutionDepthwiseCPUKernel::InitBuffer() {
-  // malloc pack input and output buffer
   if (conv_param_->input_channel_ % C4NUM != 0) {
     need_align_ = true;
     int IC4 = UP_DIV(conv_param_->input_channel_, C4NUM);
@@ -111,7 +105,6 @@ int DeconvolutionDepthwiseCPUKernel::InitBuffer() {
       MS_LOG(ERROR) << "Malloc buffer failed.";
       return RET_ERROR;
     }
-    memset(packed_input_, 0, pack_input_size * sizeof(float));
 
     int OC4 = UP_DIV(conv_param_->output_channel_, C4NUM);
     int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C4NUM * OC4;
@@ -126,6 +119,17 @@ int DeconvolutionDepthwiseCPUKernel::InitBuffer() {
 }
 
 int DeconvolutionDepthwiseCPUKernel::Init() {
+  sliding_ = new (std::nothrow) SlidingWindowParam;
+  if (sliding_ == nullptr) {
+    MS_LOG(ERROR) << "new sliding window param failed.";
+    return RET_ERROR;
+  }
+  auto ret = InitWeightBias();
+  if (ret != 0) {
+    MS_LOG(ERROR) << "Deconvolution depthwise fp32 InitWeightBias failed.ret: " << ret;
+    return ret;
+  }
   if (!InferShapeDone()) {
     return RET_OK;
   }
@@ -135,16 +139,9 @@ int DeconvolutionDepthwiseCPUKernel::Init() {
 int DeconvolutionDepthwiseCPUKernel::ReSize() {
   FreeTmpBuffer();
   InitSlideParam();
-  // conv base init
   ConvolutionBaseCPUKernel::Init();
 
-  auto ret = InitWeightBias();
-  if (ret != 0) {
-    MS_LOG(ERROR) << "Deconvolution depthwise fp32 InitWeightBias failed.ret: " << ret;
-    return ret;
-  }
-
-  ret = InitBuffer();
+  auto ret = InitBuffer();
   if (ret != 0) {
     MS_LOG(ERROR) << "Deconvolution depthwise fp32 InitBuffer failed.ret: " << ret;
     return ret;
@@ -181,7 +178,6 @@ int DeconvolutionDepthwiseCPUKernel::Run() {
   auto input_tensor = in_tensors_.at(kInputIndex);
   auto input_addr = reinterpret_cast<float *>(input_tensor->Data());
 
-  // pack input: to nhwc4
   if (need_align_) {
     PackNHWCToNHWC4Fp32(input_addr, packed_input_, conv_param_->input_batch_,
                         conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
......
@@ -29,15 +29,6 @@ using mindspore::schema::PrimitiveType_DepthwiseConv2D;
 namespace mindspore::kernel {
 void ConvolutionDepthwiseInt8CPUKernel::FreeTmpBuffer() {
-  if (sliding != nullptr) {
-    delete sliding;
-    sliding = nullptr;
-  }
-  if (packed_weight_ != nullptr) {
-    free(packed_weight_);
-    packed_weight_ = nullptr;
-  }
   if (packed_input_ != nullptr) {
     free(packed_input_);
     packed_input_ = nullptr;
@@ -51,6 +42,14 @@ void ConvolutionDepthwiseInt8CPUKernel::FreeTmpBuffer() {
 }
 
 ConvolutionDepthwiseInt8CPUKernel::~ConvolutionDepthwiseInt8CPUKernel() {
+  if (sliding != nullptr) {
+    delete sliding;
+    sliding = nullptr;
+  }
+  if (packed_weight_ != nullptr) {
+    free(packed_weight_);
+    packed_weight_ = nullptr;
+  }
   FreeTmpBuffer();
   FreeQuantParam();
 }
@@ -58,18 +57,18 @@ ConvolutionDepthwiseInt8CPUKernel::~ConvolutionDepthwiseInt8CPUKernel() {
 int ConvolutionDepthwiseInt8CPUKernel::InitWeightBias() {
   // init weight, int8 -> int16
   // o, h, w, i -> o/8, h, w, i, 8; o == group, i == 1
-  auto origin_weight = reinterpret_cast<int8_t *>(in_tensors_[kWeightIndex]->Data());
-  int OC4 = UP_DIV(conv_param_->output_channel_, C4NUM);
-  int pack_weight_size = C4NUM * OC4 * conv_param_->kernel_h_ * conv_param_->kernel_w_;
+  auto weight_tensor = in_tensors_[kWeightIndex];
+  auto origin_weight = reinterpret_cast<int8_t *>(weight_tensor->Data());
+  int OC4 = UP_DIV(weight_tensor->Batch(), C4NUM);
+  int pack_weight_size = C4NUM * OC4 * weight_tensor->Height() * weight_tensor->Width();
   packed_weight_ = reinterpret_cast<int16_t *>(malloc(pack_weight_size * sizeof(int16_t)));
   if (packed_weight_ == nullptr) {
     MS_LOG(ERROR) << "Malloc buffer failed.";
     return RET_ERROR;
   }
-  memset(packed_weight_, 0, pack_weight_size * sizeof(int16_t));
-  PackDepthwiseInt8Weight(origin_weight, packed_weight_, conv_param_);
+  PackDepthwiseInt8Weight(origin_weight, packed_weight_, weight_tensor->Height() * weight_tensor->Width(),
+                          weight_tensor->Batch(), &(conv_param_->conv_quant_arg_));
 
-  // init bias, add output zp
   bias_data_ = reinterpret_cast<int32_t *>(malloc(C4NUM * OC4 * sizeof(int32_t)));
   if (bias_data_ == nullptr) {
     MS_LOG(ERROR) << "Malloc buffer failed.";
@@ -77,18 +76,19 @@ int ConvolutionDepthwiseInt8CPUKernel::InitWeightBias() {
   }
   memset(bias_data_, 0, C4NUM * OC4 * sizeof(int32_t));
   if (in_tensors_.size() == kInputSize2) {
-    auto ori_bias = reinterpret_cast<int32_t *>(in_tensors_.at(kBiasIndex)->Data());
-    memcpy(bias_data_, ori_bias, conv_param_->output_channel_ * sizeof(int32_t));
+    auto bias_tensor = in_tensors_.at(kBiasIndex);
+    auto ori_bias = reinterpret_cast<int32_t *>(bias_tensor->Data());
+    memcpy(bias_data_, ori_bias, bias_tensor->ElementsNum() * sizeof(int32_t));
   }
+  conv_param_->thread_num_ = MSMIN(thread_count_, OC4);
   return RET_OK;
 }
 
 int ConvolutionDepthwiseInt8CPUKernel::InitBuffer() {
-  // malloc packed input buffer
   int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C4NUM *
                         UP_DIV(conv_param_->input_channel_, 4);
   packed_input_ = reinterpret_cast<int16_t *>(malloc(pack_input_size * sizeof(int16_t)));
-  memset(packed_input_, 0, pack_input_size * sizeof(int16_t));
   if (packed_input_ == nullptr) {
     MS_LOG(ERROR) << "Malloc buffer failed.";
     return RET_ERROR;
@@ -108,6 +108,11 @@ int ConvolutionDepthwiseInt8CPUKernel::InitBuffer() {
 }
 
 int ConvolutionDepthwiseInt8CPUKernel::Init() {
+  sliding = new (std::nothrow) SlidingWindowParam;
+  if (sliding == nullptr) {
+    MS_LOG(ERROR) << "new sliding window param.";
+    return RET_ERROR;
+  }
   if (!InferShapeDone()) {
     return RET_OK;
   }
@@ -116,32 +121,19 @@ int ConvolutionDepthwiseInt8CPUKernel::Init() {
 int ConvolutionDepthwiseInt8CPUKernel::ReSize() {
   FreeTmpBuffer();
-  // conv base init
   ConvolutionBaseCPUKernel::Init();
-
-  // init sliding window param
-  sliding = new (std::nothrow) SlidingWindowParam;
-  if (sliding == nullptr) {
-    MS_LOG(ERROR) << "new sliding window param.";
-    return RET_ERROR;
-  }
   InitSlidingParamConvDw(sliding, conv_param_, C4NUM);
 
-  // init quant param
   auto ret = ConvolutionBaseCPUKernel::SetQuantParam();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Set quant param failed.";
     return ret;
   }
-
-  // init weight and bias
   ret = InitWeightBias();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Depthwise int8 InitWeightBias error!";
     return ret;
   }
   ret = InitBuffer();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Depthwise int8 ReSize error!";
@@ -177,7 +169,6 @@ int ConvolutionDepthwiseInt8CPUKernel::Run() {
     return RET_ERROR;
   }
 
-  // pack input, assume input format: NHWC -> NHWC4
   auto input_tensor = in_tensors_.at(kInputIndex);
   auto input_addr = reinterpret_cast<int8_t *>(input_tensor->Data());
   PackDepthwiseInt8Input(input_addr, packed_input_, conv_param_);
......
@@ -29,11 +29,6 @@ using mindspore::schema::PrimitiveType_DeDepthwiseConv2D;
 namespace mindspore::kernel {
 DeconvolutionDepthwiseInt8CPUKernel::~DeconvolutionDepthwiseInt8CPUKernel() {
-  FreeTmpBuffer();
-  FreeQuantParam();
-}
-
-void DeconvolutionDepthwiseInt8CPUKernel::FreeTmpBuffer() {
   if (sliding != nullptr) {
     delete sliding;
     sliding = nullptr;
@@ -42,6 +37,11 @@ void DeconvolutionDepthwiseInt8CPUKernel::FreeTmpBuffer() {
     delete packed_weight_;
     packed_weight_ = nullptr;
   }
+  FreeTmpBuffer();
+  FreeQuantParam();
+}
+
+void DeconvolutionDepthwiseInt8CPUKernel::FreeTmpBuffer() {
   if (packed_input_ != nullptr) {
     delete packed_input_;
     packed_input_ = nullptr;
@@ -61,18 +61,18 @@ void DeconvolutionDepthwiseInt8CPUKernel::FreeTmpBuffer() {
 int DeconvolutionDepthwiseInt8CPUKernel::InitWeightBias() {
   // init weight: int8 -> int16
   // o, h, w, i -> o/8, h, w, i, 8; o == group, i == 1
-  auto origin_weight = reinterpret_cast<int8_t *>(in_tensors_[kWeightIndex]->Data());
-  int OC4 = UP_DIV(conv_param_->output_channel_, C4NUM);
-  int pack_weight_size = C4NUM * OC4 * conv_param_->kernel_h_ * conv_param_->kernel_w_;
+  auto weight_tensor = in_tensors_[kWeightIndex];
+  auto origin_weight = reinterpret_cast<int8_t *>(weight_tensor->Data());
+  int OC4 = UP_DIV(weight_tensor->Batch(), C4NUM);
+  int pack_weight_size = C4NUM * OC4 * weight_tensor->Height() * weight_tensor->Width();
   packed_weight_ = reinterpret_cast<int16_t *>(malloc(pack_weight_size * sizeof(int16_t)));
   if (packed_weight_ == nullptr) {
     MS_LOG(ERROR) << "Malloc buffer failed.";
     return RET_ERROR;
   }
-  memset(packed_weight_, 0, pack_weight_size * sizeof(int16_t));
-  PackDepthwiseInt8Weight(origin_weight, packed_weight_, conv_param_);
+  PackDepthwiseInt8Weight(origin_weight, packed_weight_, weight_tensor->Height() * weight_tensor->Width(),
+                          weight_tensor->Batch(), &(conv_param_->conv_quant_arg_));
 
-  // init bias, add output zp
   bias_data_ = reinterpret_cast<int32_t *>(malloc(C4NUM * OC4 * sizeof(int32_t)));
   if (bias_data_ == nullptr) {
     MS_LOG(ERROR) << "Malloc buffer failed.";
@@ -80,9 +80,11 @@ int DeconvolutionDepthwiseInt8CPUKernel::InitWeightBias() {
   }
   memset(bias_data_, 0, C4NUM * OC4 * sizeof(int32_t));
   if (in_tensors_.size() == kInputSize2) {
-    auto ori_bias = reinterpret_cast<int32_t *>(in_tensors_.at(kBiasIndex)->Data());
-    memcpy(bias_data_, ori_bias, conv_param_->output_channel_ * sizeof(int32_t));
+    auto bias_tensor = in_tensors_.at(kBiasIndex);
+    auto ori_bias = reinterpret_cast<int32_t *>(bias_tensor->Data());
+    memcpy(bias_data_, ori_bias, bias_tensor->ElementsNum() * sizeof(int32_t));
   }
+  conv_param_->thread_num_ = MSMIN(thread_count_, OC4);
   return RET_OK;
 }
@@ -96,7 +98,6 @@ int DeconvolutionDepthwiseInt8CPUKernel::InitSlideParam() {
   conv_param_->output_w_ = in_tensors_.front()->shape().at(kNHWC_W);
   conv_param_->output_channel_ = in_tensors_.front()->shape().at(kNHWC_C);
 
-  // init sliding window param
   InitSlidingParamConvDw(sliding, conv_param_, C4NUM);
   sliding->in_h_step_ = conv_param_->input_w_ * C4NUM;
@@ -108,11 +109,9 @@ int DeconvolutionDepthwiseInt8CPUKernel::InitSlideParam() {
 }
 
 int DeconvolutionDepthwiseInt8CPUKernel::InitBuffer() {
-  // malloc packed input buffer
   int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C4NUM *
                         UP_DIV(conv_param_->input_channel_, 4);
   packed_input_ = reinterpret_cast<int16_t *>(malloc(pack_input_size * sizeof(int16_t)));
-  memset(packed_input_, 0, pack_input_size * sizeof(int16_t));
   if (packed_input_ == nullptr) {
     MS_LOG(ERROR) << "Malloc buffer failed.";
     return RET_ERROR;
@@ -130,7 +129,6 @@ int DeconvolutionDepthwiseInt8CPUKernel::InitBuffer() {
     memset(packed_output_, 0, pack_output_size * sizeof(int8_t));
   }
 
-  // malloc tmp buffer for int32 output
   output_buffer_ =
     reinterpret_cast<int32_t *>(malloc(conv_param_->output_h_ * conv_param_->output_w_ * C4NUM * sizeof(int32_t)));
   if (output_buffer_ == nullptr) {
@@ -145,41 +143,33 @@ int DeconvolutionDepthwiseInt8CPUKernel::InitBuffer() {
 }
 
 int DeconvolutionDepthwiseInt8CPUKernel::Init() {
-  if (!InferShapeDone()) {
-    return RET_OK;
-  }
-  return ReSize();
-}
-
-int DeconvolutionDepthwiseInt8CPUKernel::ReSize() {
-  FreeTmpBuffer();
   sliding = new (std::nothrow) SlidingWindowParam;
   if (sliding == nullptr) {
     MS_LOG(ERROR) << "new SlidingWindowParam fail!";
     return RET_ERROR;
   }
-  InitSlideParam();
-  // conv base init
-  ConvolutionBaseCPUKernel::Init();
-
-  // init quant param
   auto ret = ConvolutionBaseCPUKernel::SetQuantParam();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Set quant param failed.";
     return ret;
   }
-
-  // init weight and bias
   ret = InitWeightBias();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Deconv Depthwise int8 InitWeightBias error!";
     return ret;
   }
+  if (!InferShapeDone()) {
+    return RET_OK;
+  }
+  return ReSize();
+}
+
+int DeconvolutionDepthwiseInt8CPUKernel::ReSize() {
+  FreeTmpBuffer();
+  InitSlideParam();
+  ConvolutionBaseCPUKernel::Init();
 
-  ret = InitBuffer();
+  auto ret = InitBuffer();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Deconv Depthwise int8 InitBuffer error!";
     return ret;
......
@@ -1035,18 +1035,18 @@ void PackDepthwiseInt8Input(const int8_t *src, int16_t *dst, const ConvParameter
   }
 }
 
-void PackDepthwiseInt8Weight(const int8_t *origin_weight, int16_t *packed_weight_, const ConvParameter *conv_param) {
-  int weight_zp = conv_param->conv_quant_arg_.filter_quant_args_[0].zp_;
-  int unit = conv_param->kernel_h_ * conv_param->kernel_w_;
-  for (int c = 0; c < conv_param->output_channel_; c++) {
-    if (conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL) {
-      weight_zp = conv_param->conv_quant_arg_.filter_quant_args_[c].zp_;
+void PackDepthwiseInt8Weight(const int8_t *origin_weight, int16_t *packed_weight_, int plane, int channel,
+                             ConvQuantArg *quant_qrg) {
+  int weight_zp = quant_qrg->filter_quant_args_[0].zp_;
+  for (int c = 0; c < channel; c++) {
+    if (quant_qrg->per_channel_ & FILTER_PER_CHANNEL) {
+      weight_zp = quant_qrg->filter_quant_args_[c].zp_;
     }
     int c4_block_num = c / C4NUM;
     int c4_block_rem = c % C4NUM;
-    const int8_t *src_c = origin_weight + c * unit;
-    int16_t *dst_c = packed_weight_ + c4_block_num * unit * C4NUM;
-    for (int k = 0; k < unit; k++) {
+    const int8_t *src_c = origin_weight + c * plane;
+    int16_t *dst_c = packed_weight_ + c4_block_num * plane * C4NUM;
+    for (int k = 0; k < plane; k++) {
       const int8_t *src_kernel = src_c + k;
       int16_t *dst_kernel = dst_c + C4NUM * k + c4_block_rem;
       *dst_kernel = (int16_t)(src_kernel[0] - weight_zp);
......
@@ -100,7 +100,8 @@ void PackNCHWToNHWCInt8(const void *src, void *dst, int batch, int plane, int ch
 void PackDepthwiseInt8Input(const int8_t *src, int16_t *dst, const ConvParameter *conv_param);
 
-void PackDepthwiseInt8Weight(const int8_t *src, int16_t *dst, const ConvParameter *conv_param);
+void PackDepthwiseInt8Weight(const int8_t *origin_weight, int16_t *packed_weight_, int plane, int channel,
+                             ConvQuantArg *quant_qrg);
 
 #ifdef __cplusplus
 }
 #endif
......
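The pack.c/pack.h change above decouples PackDepthwiseInt8Weight() from ConvParameter: callers now pass the kernel plane (h * w), the channel count, and the quantization arguments directly, so the routine can be driven from the weight tensor alone. A hedged usage sketch against the signature shown in this diff; the shape constants are made up for illustration, and ConvQuantArg is only forward-declared here:

#include <cstdint>
#include <cstdlib>

struct ConvQuantArg;  // defined in the lite quantization headers

// Signature as declared in the updated pack.h above (exported with C linkage).
extern "C" void PackDepthwiseInt8Weight(const int8_t *origin_weight, int16_t *packed_weight_, int plane, int channel,
                                        ConvQuantArg *quant_qrg);

void PackWeightsExample(const int8_t *weight_data, ConvQuantArg *quant_arg) {
  const int kKh = 3, kKw = 3, kChannels = 10;  // hypothetical depthwise weight shape: o = 10, h = w = 3, i = 1
  const int kC4 = 4;
  int oc4 = (kChannels + kC4 - 1) / kC4;       // UP_DIV(channel, C4NUM)
  int packed_count = kC4 * oc4 * kKh * kKw;
  auto *packed = static_cast<int16_t *>(malloc(packed_count * sizeof(int16_t)));
  if (packed == nullptr) {
    return;
  }
  PackDepthwiseInt8Weight(weight_data, packed, kKh * kKw, kChannels, quant_arg);
  // ... hand `packed` to the int8 depthwise kernel ...
  free(packed);
}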