Commit 867afc71, authored by mindspore-ci-bot, committed by Gitee

!4157 [MS][LITE] fix bug of arm cpu fp32 op: batchnorm, scale, pooling

Merge pull request !4157 from yangruoqi713/lite
@@ -56,13 +56,11 @@ int Pooling::InferShape(std::vector<tensor::Tensor *> inputs_, std::vector<tenso
   } else {
     auto round_mode = pooling_prim->roundMode();
     if (round_mode == schema::RoundMode_FLOOR) {
-      output_h = std::floor((input_h + pad_u_ + pad_d_ - window_h) / pooling_prim->strideH() + 1);
-      output_w = std::floor((input_w + pad_l_ + pad_r_ - window_w) / pooling_prim->strideW() + 1);
+      output_h = std::floor(static_cast<float>(input_h + pad_u_ + pad_d_ - window_h) / pooling_prim->strideH()) + 1;
+      output_w = std::floor(static_cast<float>(input_w + pad_l_ + pad_r_ - window_w) / pooling_prim->strideW()) + 1;
     } else if (round_mode == schema::RoundMode_CEIL) {
-      output_h =
-        std::ceil((input_h + pooling_prim->padUp() + pooling_prim->padDown() - window_h) / pooling_prim->strideH() + 1);
-      output_w = std::ceil(
-        (input_w + pooling_prim->padLeft() + pooling_prim->padRight() - window_w) / pooling_prim->strideW() + 1);
+      output_h = std::ceil(static_cast<float>(input_h + pad_u_ + pad_d_ - window_h) / pooling_prim->strideH()) + 1;
+      output_w = std::ceil(static_cast<float>(input_w + pad_l_ + pad_r_ - window_w) / pooling_prim->strideW()) + 1;
     } else {
       MS_LOG(ERROR) << "unsupported round mode.";
     }
@@ -80,4 +78,3 @@ int Pooling::InferShape(std::vector<tensor::Tensor *> inputs_, std::vector<tenso
   return RET_OK;
 }
 }  // namespace mindspore::lite
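The bug in both branches was that the division ran on integer operands before std::floor/std::ceil were applied, so CEIL could silently lose an output row or column. A minimal sketch of the corrected computation (function and variable names here are illustrative, not part of the MindSpore sources):

#include <cmath>
#include <cstdio>

// Output size of one pooling dimension, matching the fixed FLOOR/CEIL branches above.
int PoolOutSize(int input, int pad_before, int pad_after, int window, int stride, bool ceil_mode) {
  // Cast to float *before* dividing: with plain integer operands the division
  // already truncates and std::floor/std::ceil have nothing left to round.
  float span = static_cast<float>(input + pad_before + pad_after - window) / stride;
  return static_cast<int>(ceil_mode ? std::ceil(span) : std::floor(span)) + 1;
}

int main() {
  // 7-wide input, no padding, window 2, stride 2: span = (7 - 2) / 2 = 2.5.
  // The old code divided in int (2) for both modes, so CEIL lost an output column;
  // with the cast, FLOOR gives 3 and CEIL gives 4.
  std::printf("floor: %d, ceil: %d\n",
              PoolOutSize(7, 0, 0, 2, 2, false),   // 3
              PoolOutSize(7, 0, 0, 2, 2, true));   // 4
  return 0;
}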
@@ -28,17 +28,23 @@ using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_BatchNorm;
 
 namespace mindspore::kernel {
-int BatchnormCPUKernel::Init() { return RET_OK; }
+int BatchnormCPUKernel::Init() {
+  auto input_shapes = inputs_[0]->shape();
+  auto n_dim = input_shapes.size();
+  batchnorm_param_->channel_ = input_shapes[n_dim - 1];
+  batchnorm_param_->unit_ = 1;
+  for (int i = 0; i < n_dim - 1; i++) {
+    batchnorm_param_->unit_ *= input_shapes[i];
+  }
+  batchnorm_param_->op_parameter_.thread_num_ =
+    MSMIN(batchnorm_param_->op_parameter_.thread_num_, batchnorm_param_->unit_);
+  return RET_OK;
+}
 
 int BatchnormCPUKernel::ReSize() { return RET_OK; }
 
-int BatchnormCPUKernel::DoExecute(int tid) {
-  int count = MSMIN(thread_unit_, units_ - tid * thread_unit_);
-  if (count <= 0) {
-    return RET_OK;
-  }
-  int offset = tid * thread_unit_ * channel_;
-  BatchNorm(in_addr_ + offset, mean_addr_, var_addr_, count, channel_, batchnorm_param_->epsilon_, out_addr_ + offset);
+int BatchnormCPUKernel::DoExecute(int task_id) {
+  BatchNorm(out_addr_, in_addr_, mean_addr_, var_addr_, task_id, batchnorm_param_);
   return RET_OK;
 }
@@ -62,15 +68,8 @@ int BatchnormCPUKernel::Run() {
   mean_addr_ = reinterpret_cast<float *>(inputs_.at(1)->Data());
   var_addr_ = reinterpret_cast<float *>(inputs_.at(2)->Data());
   out_addr_ = reinterpret_cast<float *>(outputs_.at(0)->Data());
-  auto input_shapes = inputs_[0]->shape();
-  channel_ = input_shapes[3];
-  units_ = 1;
-  for (int i = 0; i < 3; i++) {
-    units_ *= input_shapes[i];
-  }
-  thread_count_ = MSMIN(thread_count_, units_);
-  thread_unit_ = UP_DIV(units_, thread_count_);
-  int ret = LiteBackendParallelLaunch(BatchNormRun, this, thread_count_);
+  int ret = LiteBackendParallelLaunch(BatchNormRun, this, batchnorm_param_->op_parameter_.thread_num_);
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "BatchnormRun error error_code[" << ret << "]";
     return ret;
...
@@ -30,10 +30,11 @@ class BatchnormCPUKernel : public LiteKernel {
   BatchnormCPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
                      const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx,
                      const lite::Primitive *primitive)
-      : LiteKernel(parameter, inputs, outputs, ctx, primitive), ctx_(ctx), thread_count_(ctx->thread_num_) {
+      : LiteKernel(parameter, inputs, outputs, ctx, primitive) {
+    opParameter->thread_num_ = ctx->thread_num_;
     batchnorm_param_ = reinterpret_cast<BatchNormParameter *>(parameter);
   }
-  ~BatchnormCPUKernel() override { delete batchnorm_param_; }
+  ~BatchnormCPUKernel() override = default;
 
   int Init() override;
   int ReSize() override;
@@ -41,15 +42,10 @@ class BatchnormCPUKernel : public LiteKernel {
   int DoExecute(int tid);
 
  private:
-  int thread_count_;
-  int thread_unit_;
-  int units_;
-  int channel_;
   float *in_addr_;
   float *mean_addr_;
   float *var_addr_;
   float *out_addr_;
-  const Context *ctx_;
   BatchNormParameter *batchnorm_param_;
 };
 }  // namespace mindspore::kernel
...
@@ -36,8 +36,12 @@ int Nchw2NhwcCPUKernel::Run() {
   auto input = inputs_[0];
   auto output = outputs_[0];
 
-  PackNCHWToNHWCFp32(input->Data(), output->Data(), output->Batch(), output->Height() * output->Width(),
-                     output->Channel());
+  if (input->shape().size() == 4) {
+    PackNCHWToNHWCFp32(input->Data(), output->Data(), output->Batch(), output->Height() * output->Width(),
+                       output->Channel());
+  } else {
+    memcpy(output->Data(), input->Data(), input->ElementsNum() * sizeof(float));
+  }
 
   return RET_OK;
 }
...
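Both transpose kernels (this one and the Nhwc2Nchw one below) now fall back to a flat memcpy when the tensor is not 4-D, since the pack routines assume an N/H/W/C decomposition. A minimal sketch of what a 4-D repack does, assuming plain float buffers (the loop-based functions here are illustrative, not MindSpore's optimized implementation):

#include <cstring>

// Illustrative NCHW -> NHWC repack: dst[n][h][w][c] = src[n][c][h][w].
// `plane` is H * W, matching the output->Height() * output->Width() argument above.
void PackNchwToNhwc(const float *src, float *dst, int batch, int plane, int channel) {
  for (int n = 0; n < batch; ++n) {
    for (int c = 0; c < channel; ++c) {
      for (int hw = 0; hw < plane; ++hw) {
        dst[(n * plane + hw) * channel + c] = src[(n * channel + c) * plane + hw];
      }
    }
  }
}

// Non 4-D tensors have no layout axes to permute, so a plain element copy is enough.
void CopyAsIs(const float *src, float *dst, int element_num) {
  std::memcpy(dst, src, element_num * sizeof(float));
}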
@@ -36,8 +36,12 @@ int Nhwc2NchwCPUKernel::Run() {
   auto input = inputs_[0];
   auto output = outputs_[0];
 
-  PackNHWCToNCHWFp32(input->Data(), output->Data(), output->Batch(), output->Height() * output->Width(),
-                     output->Channel());
+  if (input->shape().size() == 4) {
+    PackNHWCToNCHWFp32(input->Data(), output->Data(), output->Batch(), output->Height() * output->Width(),
+                       output->Channel());
+  } else {
+    memcpy(output->Data(), input->Data(), input->ElementsNum() * sizeof(float));
+  }
 
   return RET_OK;
 }
...
@@ -45,12 +45,13 @@ int ScaleCPUKernel::InitScaleOffset() {
   }
 
   if (inputs_.size() == 3) {
-    auto offset_tensor = inputs_.at(1);
+    auto offset_tensor = inputs_.at(2);
     offset_ = reinterpret_cast<float *>(malloc(offset_tensor->ElementsNum() * sizeof(float)));
     if (offset_ == nullptr) {
       MS_LOG(ERROR) << "Malloc buffer failed.";
       return RET_ERROR;
     }
+    memcpy(offset_, offset_tensor->Data(), offset_tensor->ElementsNum() * sizeof(float));
     param->has_offset_ = true;
   } else {
     offset_ = nullptr;
...
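This hunk fixes two problems at once: the offset was read from inputs_.at(1), which holds the scale tensor, instead of inputs_.at(2), and the freshly malloc'ed buffer was never filled with the offset data. As a reference for what the op computes once both buffers are populated, here is a minimal per-channel sketch (the input/scale/offset ordering follows the diff; the function itself is illustrative rather than the kernel's actual compute routine):

#include <cstddef>

// Illustrative scale-with-offset: out[i] = in[i] * scale[c] + offset[c],
// where c indexes the innermost axis of length `channel`.
void ScaleWithOffset(const float *in, const float *scale, const float *offset,
                     float *out, size_t outer, size_t channel) {
  for (size_t o = 0; o < outer; ++o) {
    for (size_t c = 0; c < channel; ++c) {
      out[o * channel + c] = in[o * channel + c] * scale[c] + offset[c];
    }
  }
}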
@@ -16,12 +16,12 @@
 #include "src/runtime/kernel/arm/nnacl/fp32/batchnorm.h"
 
-void BatchNorm(const float *input_ptr, const float *mean_ptr, const float *variance_ptr, int units, int channel,
-               float epsilon, float *output_ptr) {
-  for (int u = 0; u < units; u++) {
-    for (int c = 0; c < channel; c++) {
-      auto variance_sqrt = sqrt(variance_ptr[c] + epsilon);
-      output_ptr[u * channel + c] = (input_ptr[u * channel + c] - mean_ptr[c]) / variance_sqrt;
+void BatchNorm(float *output_ptr, const float *input_ptr, const float *mean_ptr, const float *variance_ptr,
+               int task_id, BatchNormParameter *param) {
+  for (int u = task_id; u < param->unit_; u += param->op_parameter_.thread_num_) {
+    for (int c = 0; c < param->channel_; c++) {
+      auto variance_sqrt = sqrt(variance_ptr[c] + param->epsilon_);
+      output_ptr[u * param->channel_ + c] = (input_ptr[u * param->channel_ + c] - mean_ptr[c]) / variance_sqrt;
     }
   }
 }
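The kernel is now parallelized by interleaving units across tasks instead of carving out contiguous blocks: task task_id handles units task_id, task_id + thread_num_, task_id + 2 * thread_num_, and so on, so no per-task offset/count bookkeeping is needed and all tasks together cover every unit exactly once. A small self-contained sketch of that partition (the shape, thread count, and struct name are made-up stand-ins for the parameter in the header above):

#include <cmath>
#include <vector>

// Minimal stand-in for the BatchNormParameter fields used by the kernel.
struct BatchNormParam {
  float epsilon;
  int unit;        // product of all dimensions except the channel axis
  int channel;     // size of the last (channel) axis
  int thread_num;  // number of parallel tasks
};

// Same strided loop as the fixed BatchNorm: each task starts at its own id and
// steps by thread_num.
void BatchNormTask(const float *in, const float *mean, const float *var,
                   float *out, int task_id, const BatchNormParam &p) {
  for (int u = task_id; u < p.unit; u += p.thread_num) {
    for (int c = 0; c < p.channel; ++c) {
      out[u * p.channel + c] = (in[u * p.channel + c] - mean[c]) / std::sqrt(var[c] + p.epsilon);
    }
  }
}

int main() {
  // Example NHWC shape {1, 4, 4, 8}: unit = 1 * 4 * 4 = 16, channel = 8, 4 tasks.
  BatchNormParam p{1e-5f, 16, 8, 4};
  std::vector<float> in(p.unit * p.channel, 1.0f), mean(p.channel, 0.0f), var(p.channel, 1.0f);
  std::vector<float> out(in.size());
  for (int tid = 0; tid < p.thread_num; ++tid) {  // the thread pool runs these tasks in parallel
    BatchNormTask(in.data(), mean.data(), var.data(), out.data(), tid, p);
  }
  return 0;
}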
@@ -22,9 +22,11 @@
 struct BatchNormParameter {
   OpParameter op_parameter_;
   float epsilon_;
+  int unit_;
+  int channel_;
 };
 
-void BatchNorm(const float *input_ptr, const float *mean_ptr, const float *variance_ptr, int count, int channel,
-               float epsilon, float *output_ptr);
+void BatchNorm(float *output_ptr, const float *input_ptr, const float *mean_ptr, const float *variance_ptr,
+               int task_id, BatchNormParameter *param);
 
 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_FUSED_BATCHNORM_H_
@@ -245,8 +245,6 @@ bool ThreadPool::SetThreadPool() {
   } else {
     AddRunThread(localMaxThreadNums);
   }
-  MS_LOG(DEBUG) << "configThreadNums=" << configThreadNums << ", curThreadNums=" << curThreadNums
-                << ", curThreadRunNums=" << curThreadRunNums << ", localMaxThreadNums=" << localMaxThreadNums;
   return true;
 }
 
@@ -276,7 +274,6 @@ void ThreadPool::AddNewThread(int newNums) {
   }
   curThreadNums += newNums;
   curThreadRunNums += newNums;
-  MS_LOG(DEBUG) << "add " << newNums << " thread";
 }
 
 bool ThreadPool::SetThreadCpuBind(bool ifBind, int mode, bool master) {
@@ -330,7 +327,6 @@ bool ThreadPool::AddTask(WorkFun &&worker, void *cdata, int numTask) {
 }
 
 bool ThreadPool::DistributeTask(ThreadPoolTask *task, int numTask) {
-  MS_LOG(DEBUG) << "numTask = " << numTask << ", curThreadRunNums = " << curThreadRunNums;
   auto taskOri = *task;
   if (numTask > curThreadRunNums) {
     task->first = [taskOri, numTask, this](int task_id, TvmEnv *penv, void *cdata) -> int {
@@ -370,12 +366,10 @@ bool ThreadPool::DistributeTask(ThreadPoolTask *task, int numTask) {
       }
     }
   }
-  MS_LOG(DEBUG) << "finish " << numTask << " task successful";
   return CheckResult();
 }
 
 void ThreadPool::AddRunThread(int num) {
-  MS_LOG(DEBUG) << "num=" << num << ", curThreadRunNums=" << curThreadRunNums;
   int activeNums = num - curThreadRunNums;
   if (activeNums <= 0 || activateList.size() < activeNums) {
     return;
@@ -389,7 +383,6 @@ void ThreadPool::AddRunThread(int num) {
 }
 
 void ThreadPool::SubRunThread(int num) {
-  MS_LOG(DEBUG) << "num=" << num << ", curThreadRunNums=" << curThreadRunNums;
   int deactiveNums = curThreadRunNums - num;
   if (deactiveNums <= 0) {
     return;
...
@@ -56,6 +56,8 @@ STATUS CaffePoolingParser::Parse(const caffe::LayerParameter &proto,
     return RET_ERROR;
   }
 
+  // default roundMode RoundMode_CEIL
+  attr->roundMode = schema::RoundMode_CEIL;
   if (poolingParam.has_round_mode()) {
     if (poolingParam.round_mode() == caffe::PoolingParameter_RoundMode_FLOOR) {
       attr->roundMode = schema::RoundMode_FLOOR;
...