diff --git a/mindspore/lite/src/populate_parameter.cc b/mindspore/lite/src/populate_parameter.cc
index 3ce3900ae9566cc5422e2e7c4c38b6fb518a87ea..4ecf73e5cf99e9749327d893c7ed0d251785f847 100644
--- a/mindspore/lite/src/populate_parameter.cc
+++ b/mindspore/lite/src/populate_parameter.cc
@@ -40,7 +40,6 @@
 #include "src/runtime/kernel/arm/nnacl/fp32/reduce.h"
 #include "src/runtime/kernel/arm/nnacl/fp32/activation.h"
 #include "src/runtime/kernel/arm/nnacl/fp32/arithmetic.h"
-#include "src/runtime/kernel/arm/nnacl/fused_batchnorm.h"
 #include "src/runtime/kernel/arm/nnacl/fp32/batchnorm.h"
 #include "src/runtime/kernel/arm/nnacl/power.h"
 #include "src/runtime/kernel/arm/nnacl/fp32/range.h"
@@ -510,15 +509,15 @@ OpParameter *PopulateActivationParameter(const lite::Primitive *primitive) {
 }
 
 OpParameter *PopulateFusedBatchNorm(const lite::Primitive *primitive) {
-  FusedBatchNormParameter *fuse_batch_norm_param = new (std::nothrow) FusedBatchNormParameter();
-  if (fuse_batch_norm_param == nullptr) {
+  BatchNormParameter *batch_norm_param = new (std::nothrow) BatchNormParameter();
+  if (batch_norm_param == nullptr) {
     MS_LOG(ERROR) << "new FusedBatchNormParameter failed.";
     return nullptr;
   }
-  fuse_batch_norm_param->op_parameter_.type_ = primitive->Type();
+  batch_norm_param->op_parameter_.type_ = primitive->Type();
   auto param = primitive->Value()->value_as_FusedBatchNorm();
-  fuse_batch_norm_param->epsilon_ = param->epsilon();
-  return reinterpret_cast<OpParameter *>(fuse_batch_norm_param);
+  batch_norm_param->epsilon_ = param->epsilon();
+  return reinterpret_cast<OpParameter *>(batch_norm_param);
 }
 
 OpParameter *PopulateArithmetic(const lite::Primitive *primitive) {
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc
index 522dd49d90f935fce3652a4abdfeabaf07c2c966..c288550e86622c4b5c8f0db6334a8110d9877224 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc
@@ -28,6 +28,22 @@ using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_DepthwiseConv2D;
 
 namespace mindspore::kernel {
+ConvolutionDepthwiseFp16CPUKernel::~ConvolutionDepthwiseFp16CPUKernel() {  // Frees sliding_ and the pack buffers.
+  delete sliding_;
+  if (packed_weight_ != nullptr) {
+    free(packed_weight_);  // pack buffers are malloc'ed, so free() (not delete) must release them
+    packed_weight_ = nullptr;
+  }
+  if (packed_input_ != nullptr) {
+    free(packed_input_);
+    packed_input_ = nullptr;
+  }
+  if (packed_output_ != nullptr) {
+    free(packed_output_);
+    packed_output_ = nullptr;
+  }
+}
+
 int ConvolutionDepthwiseFp16CPUKernel::InitBuffer() {
   // malloc pack input buffer
   int C8 = UP_DIV(conv_param_->input_channel_, C8NUM);
@@ -113,8 +129,14 @@ int ConvolutionDepthwiseFp16CPUKernel::Init() {
 }
 
 int ConvolutionDepthwiseFp16CPUKernel::ReSize() {
-  free(packed_input_);
-  free(packed_output_);
+  if (packed_input_ != nullptr) {
+    free(packed_input_);  // malloc'ed pack buffer: releasing it with delete would be undefined behaviour
+    packed_input_ = nullptr;
+  }
+  if (packed_output_ != nullptr) {
+    free(packed_output_);
+    packed_output_ = nullptr;  // cleared so a later release cannot free the stale pointer again
+  }
 
   ConvolutionBaseCPUKernel::Init();
   InitSlidingParam(sliding_, conv_param_, C8NUM);
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h
index d605ca5ba0bec17dc6d1ca46eeb5cabee2ffb3aa..de7f2cce2962988945921471b5365268d3e86e37 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h
@@ -29,12 +29,7 @@ class ConvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseCPUKernel {
                                     const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx,
                                     const lite::Primitive *primitive)
       : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
-  ~ConvolutionDepthwiseFp16CPUKernel() override {
-    delete sliding_;
-    free(packed_weight_);
-    free(packed_input_);
-    free(packed_output_);
-  }
+  ~ConvolutionDepthwiseFp16CPUKernel() override;
 
   int Init() override;
   int ReSize() override;
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc
index 0df66a963cba032cd08ec86f7ff5c7c1b8837536..96aa37c63e62c6ff5b0aa347f3f7323f4aa26e32 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc
@@ -28,6 +28,22 @@ using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_DeDepthwiseConv2D;
 
 namespace mindspore::kernel {
+DeconvolutionDepthwiseFp16CPUKernel::~DeconvolutionDepthwiseFp16CPUKernel() {  // Frees sliding_ and pack buffers.
+  delete sliding_;
+  if (packed_weight_ != nullptr) {
+    free(packed_weight_);  // C-allocated buffer: must be released with free(), not delete
+    packed_weight_ = nullptr;
+  }
+  if (packed_input_ != nullptr) {
+    free(packed_input_);
+    packed_input_ = nullptr;
+  }
+  if (packed_output_ != nullptr) {
+    free(packed_output_);
+    packed_output_ = nullptr;
+  }
+}
+
 int DeconvolutionDepthwiseFp16CPUKernel::InitSlideParam() {
   conv_param_->input_batch_ = outputs_.front()->shape().at(kNHWC_N);
   conv_param_->input_h_ = outputs_.front()->shape().at(kNHWC_H);
@@ -126,8 +142,14 @@ int DeconvolutionDepthwiseFp16CPUKernel::Init() {
 }
 
 int DeconvolutionDepthwiseFp16CPUKernel::ReSize() {
-  free(packed_input_);
-  free(packed_output_);
+  if (packed_input_ != nullptr) {
+    free(packed_input_);  // C-allocated buffer: must be released with free(), not delete
+    packed_input_ = nullptr;
+  }
+  if (packed_output_ != nullptr) {
+    free(packed_output_);
+    packed_output_ = nullptr;  // cleared so a later release cannot free the stale pointer again
+  }
 
   InitSlideParam();
   ConvolutionBaseCPUKernel::Init();
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.h
index 64807fa9d898ac0dde52b50ae2a6ea38cb8d1ab6..be88809971af1035f72ae5281e1e3fce98ec6217 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.h
@@ -29,14 +29,7 @@ class DeconvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseCPUKernel {
                                       const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx,
                                       const lite::Primitive *primitive)
       : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
-  ~DeconvolutionDepthwiseFp16CPUKernel() override {
-    delete sliding_;
-    free(packed_weight_);
-    if (need_align_) {
-      free(packed_input_);
-      free(packed_output_);
-    }
-  };
+  ~DeconvolutionDepthwiseFp16CPUKernel() override;
 
   int Init() override;
   int ReSize() override;
@@ -52,7 +45,6 @@ class DeconvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseCPUKernel {
   float16_t *packed_weight_;
   float16_t *packed_input_;
   float16_t *packed_output_;
-  bool need_align_ = false;
 };
 }  // namespace mindspore::kernel
 
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/batchnorm.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/batchnorm.cc
index 4a827a78ee61107c5965dd56981977deca0d62c3..157c4b76c1cfccb995fe87387462898afd31dfe4 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/batchnorm.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/batchnorm.cc
@@ -15,7 +15,6 @@
  */
 
 #include "src/runtime/kernel/arm/fp32/batchnorm.h"
-#include <cmath>
 #include "schema/model_generated.h"
 #include "src/kernel_registry.h"
 #include "include/errorcode.h"
@@ -28,7 +27,42 @@ using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_BatchNorm;
 
 namespace mindspore::kernel {
+BatchnormCPUKernel::~BatchnormCPUKernel() {  // Frees the buffers malloc'ed by InitConstTensor().
+  if (mean_addr_ != nullptr) {  // NOTE(review): assumes the member starts as nullptr - confirm in batchnorm.h
+    free(mean_addr_);
+    mean_addr_ = nullptr;
+  }
+  if (var_addr_ != nullptr) {
+    free(var_addr_);
+    var_addr_ = nullptr;
+  }
+}
+
+int BatchnormCPUKernel::InitConstTensor() {  // Copies the mean (input 1) and variance (input 2) into owned buffers.
+  auto mean = inputs_[1];
+  mean_addr_ = reinterpret_cast<float *>(malloc(mean->ElementsNum() * sizeof(float)));
+  if (mean_addr_ == nullptr) {
+    MS_LOG(ERROR) << "Malloc buffer failed.";
+    return RET_ERROR;
+  }
+  memcpy(mean_addr_, mean->Data(), mean->ElementsNum() * sizeof(float));  // NOTE(review): Data() is not null-checked
+
+  auto variance = inputs_[2];
+  var_addr_ = reinterpret_cast<float *>(malloc(variance->ElementsNum() * sizeof(float)));
+  if (var_addr_ == nullptr) {
+    MS_LOG(ERROR) << "Malloc buffer failed.";
+    return RET_ERROR;  // mean_addr_ stays allocated on this path; the destructor releases it
+  }
+  memcpy(var_addr_, variance->Data(), variance->ElementsNum() * sizeof(float));
+  return RET_OK;  // RET_OK once both copies exist, RET_ERROR if either malloc fails
+}
+
 int BatchnormCPUKernel::Init() {
+  if (context_->infer_shape_interrupt_ && !context_->running_) {
+    SetNeedReInit();
+    return RET_OK;
+  }
+
   auto input_shapes = inputs_[0]->shape();
   auto n_dim = input_shapes.size();
   batchnorm_param_->channel_ = input_shapes[n_dim - 1];
@@ -37,11 +71,24 @@ int BatchnormCPUKernel::Init() {
     batchnorm_param_->unit_ *= input_shapes[i];
   }
   batchnorm_param_->op_parameter_.thread_num_ =
-    MSMIN(batchnorm_param_->op_parameter_.thread_num_, batchnorm_param_->unit_);
+    MSMIN(batchnorm_param_->op_parameter_.thread_num_, batchnorm_param_->channel_);
+
+  auto ret = InitConstTensor();
+  if (ret != 0) {
+    MS_LOG(ERROR) << "Batchnorm fp32 InitConstTensor failed.";
+    return RET_ERROR;
+  }
   return RET_OK;
 }
 
-int BatchnormCPUKernel::ReSize() { return RET_OK; }
+int BatchnormCPUKernel::ReSize() {  // Recomputes unit_ as the product of every input-0 dim except the last.
+  auto input_shapes = inputs_[0]->shape();
+  batchnorm_param_->unit_ = 1;
+  for (size_t i = 0; i + 1 < input_shapes.size(); i++) {  // "i + 1 <" avoids size() - 1 wrapping on an empty shape
+    batchnorm_param_->unit_ *= input_shapes[i];
+  }
+  return RET_OK;
+}
 
 int BatchnormCPUKernel::DoExecute(int task_id) {
   BatchNorm(out_addr_, in_addr_, mean_addr_, var_addr_, task_id, batchnorm_param_);
@@ -61,12 +108,10 @@ int BatchNormRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) {
 int BatchnormCPUKernel::Run() {
   auto prepare_ret = Prepare();
   if (prepare_ret != RET_OK) {
-    MS_LOG(ERROR) << "Prepare fail!ret: " << prepare_ret;
+    MS_LOG(ERROR) << "Prepare fail! Ret error code: " << prepare_ret;
     return prepare_ret;
   }
   in_addr_ = reinterpret_cast<float *>(inputs_.at(0)->Data());
-  mean_addr_ = reinterpret_cast<float *>(inputs_.at(1)->Data());
-  var_addr_ = reinterpret_cast<float *>(inputs_.at(2)->Data());
   out_addr_ = reinterpret_cast<float *>(outputs_.at(0)->Data());
 
   int ret = LiteBackendParallelLaunch(BatchNormRun, this, batchnorm_param_->op_parameter_.thread_num_);
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/batchnorm.h b/mindspore/lite/src/runtime/kernel/arm/fp32/batchnorm.h
index 4ad0224511b16554a0f123053c8b210195d51f6f..28d9027cf81c311a241004e4638ea8f54ffb72c8 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/batchnorm.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/batchnorm.h
@@ -31,14 +31,14 @@ class BatchnormCPUKernel : public LiteKernel {
                      const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx,
                      const lite::Primitive *primitive)
       : LiteKernel(parameter, inputs, outputs, ctx, primitive) {
-    opParameter->thread_num_ = ctx->thread_num_;
     batchnorm_param_ = reinterpret_cast<BatchNormParameter *>(parameter);
   }
-  ~BatchnormCPUKernel() override = default;
+  ~BatchnormCPUKernel() override;
 
   int Init() override;
   int ReSize() override;
   int Run() override;
+  int InitConstTensor();
   int DoExecute(int tid);
 
  private:
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise.cc
index 0de33adb567a4364566dd8f026848915eb94f19b..f9ca15665d6baa6f930e7e4a7179611ca53d7cc6 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise.cc
@@ -29,6 +29,24 @@ using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_DepthwiseConv2D;
 
 namespace mindspore::kernel {
+ConvolutionDepthwiseCPUKernel::~ConvolutionDepthwiseCPUKernel() {  // Frees sliding_ and the pack buffers.
+  delete sliding_;
+  if (packed_weight_ != nullptr) {
+    free(packed_weight_);  // C-allocated buffer: must be released with free(), not delete
+    packed_weight_ = nullptr;
+  }
+  if (need_align_) {
+    if (packed_input_ != nullptr) {
+      free(packed_input_);
+      packed_input_ = nullptr;
+    }
+    if (packed_output_ != nullptr) {
+      free(packed_output_);
+      packed_output_ = nullptr;
+    }
+  }
+}
+
 int ConvolutionDepthwiseCPUKernel::InitWeightBias() {
   // init weight: o, h, w, i; o == group, i == 1
   auto weight_tensor = inputs_[kWeightIndex];
@@ -114,9 +132,16 @@ int ConvolutionDepthwiseCPUKernel::Init() {
 
 int ConvolutionDepthwiseCPUKernel::ReSize() {
   if (need_align_) {
-    free(packed_input_);
-    free(packed_output_);
+    if (packed_input_ != nullptr) {
+      free(packed_input_);  // C-allocated buffer: must be released with free(), not delete
+      packed_input_ = nullptr;
+    }
+    if (packed_output_ != nullptr) {
+      free(packed_output_);
+      packed_output_ = nullptr;  // cleared so a later release cannot free the stale pointer again
+    }
   }
+
   // conv base init
   ConvolutionBaseCPUKernel::Init();
 
@@ -197,10 +222,11 @@ kernel::LiteKernel *CpuConvDwFp32KernelCreator(const std::vector<lite::tensor::T
   kernel = new (std::nothrow) kernel::ConvolutionDepthwiseCPUKernel(opParameter, inputs, outputs, ctx, primitive);
   //  auto param = reinterpret_cast<ConvParameter *>(opParameter);
   //  if (param->kernel_h_ == 3 && param->kernel_w_ == 3 && param->stride_h_ == 1 && param->stride_w_ == 1 &&
-  //  param->dilation_h_ == 1 && param->dilation_w_ == 1) {
-  //    kernel = new (std::nothrow) kernel::ConvolutionDepthwise3x3CPUKernel(opParameter, inputs, outputs, ctx);
+  //      param->dilation_h_ == 1 && param->dilation_w_ == 1) {
+  //    kernel = new (std::nothrow) kernel::ConvolutionDepthwise3x3CPUKernel(opParameter, inputs, outputs, ctx,
+  //    primitive);
   //  } else {
-  //  kernel = new (std::nothrow) kernel::ConvolutionDepthwiseCPUKernel(opParameter, inputs, outputs, ctx);
+  //    kernel = new (std::nothrow) kernel::ConvolutionDepthwiseCPUKernel(opParameter, inputs, outputs, ctx, primitive);
   //  }
 
   if (kernel == nullptr) {
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise.h b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise.h
index 08706ac050ab48daebdc19b9fc41f4d081448ad6..22de529bcab8c7515bc2b41accc0f4318d8cf09c 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise.h
@@ -29,14 +29,7 @@ class ConvolutionDepthwiseCPUKernel : public ConvolutionBaseCPUKernel {
                                 const std::vector<lite::tensor::Tensor *> &outputs, const lite::Context *ctx,
                                 const lite::Primitive *primitive)
       : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
-  ~ConvolutionDepthwiseCPUKernel() override {
-    delete sliding_;
-    free(packed_weight_);
-    if (need_align_) {
-      free(packed_input_);
-      free(packed_output_);
-    }
-  };
+  ~ConvolutionDepthwiseCPUKernel() override;
 
   int Init() override;
   int ReSize() override;
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_depthwise.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_depthwise.cc
index a60851e17860f1e0a6731737bdd0a944225eb776..77cc8ac22e333d217b297e350a60f653f061a287 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_depthwise.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_depthwise.cc
@@ -27,6 +27,24 @@ using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_DeDepthwiseConv2D;
 
 namespace mindspore::kernel {
+DeconvolutionDepthwiseCPUKernel::~DeconvolutionDepthwiseCPUKernel() {  // Frees sliding_ and the pack buffers.
+  delete sliding_;
+  if (packed_weight_ != nullptr) {
+    free(packed_weight_);  // C-allocated buffer: must be released with free(), not delete
+    packed_weight_ = nullptr;
+  }
+  if (need_align_) {
+    if (packed_input_ != nullptr) {
+      free(packed_input_);
+      packed_input_ = nullptr;
+    }
+    if (packed_output_ != nullptr) {
+      free(packed_output_);
+      packed_output_ = nullptr;
+    }
+  }
+}
+
 int DeconvolutionDepthwiseCPUKernel::InitSlideParam() {
   conv_param_->input_batch_ = outputs_.front()->shape().at(kNHWC_N);
   conv_param_->input_h_ = outputs_.front()->shape().at(kNHWC_H);
@@ -126,8 +144,14 @@ int DeconvolutionDepthwiseCPUKernel::Init() {
 
 int DeconvolutionDepthwiseCPUKernel::ReSize() {
   if (need_align_) {
-    free(packed_input_);
-    free(packed_output_);
+    if (packed_input_ != nullptr) {
+      free(packed_input_);  // C-allocated buffer: must be released with free(), not delete
+      packed_input_ = nullptr;
+    }
+    if (packed_output_ != nullptr) {
+      free(packed_output_);
+      packed_output_ = nullptr;  // cleared so a later release cannot free the stale pointer again
+    }
   }
   InitSlideParam();
 
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_depthwise.h b/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_depthwise.h
index 0ad3c18d444f1684711d5a0d6dc7d78a030bb5a9..06400a3ba169e70c93af4cdbe95a4cafae515459 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_depthwise.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_depthwise.h
@@ -29,14 +29,7 @@ class DeconvolutionDepthwiseCPUKernel : public ConvolutionBaseCPUKernel {
                                   const std::vector<lite::tensor::Tensor *> &outputs, const lite::Context *ctx,
                                   const lite::Primitive *primitive)
       : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
-  ~DeconvolutionDepthwiseCPUKernel() override {
-    delete sliding_;
-    free(packed_weight_);
-    if (need_align_) {
-      free(packed_input_);
-      free(packed_output_);
-    }
-  };
+  ~DeconvolutionDepthwiseCPUKernel() override;
 
   int Init() override;
   int InitSlideParam();
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/flatten.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/flatten.cc
index 868705746b5c6c2c05c222e5ef4fbc429b585cf4..30e1ede1ed355e62f7dc5c4286c1e5693a3c7967 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/flatten.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/flatten.cc
@@ -32,6 +32,12 @@ int FlattenCPUKernel::Init() {
     SetNeedReInit();
     return RET_OK;
   }
+
+  ReSize();
+  return RET_OK;
+}
+
+int FlattenCPUKernel::ReSize() {
   auto output_shape = outputs_[0]->shape();
   flatten_param_->size = sizeof(float);
   for (int i = 0; i < output_shape.size(); i++) {
@@ -40,8 +46,6 @@ int FlattenCPUKernel::Init() {
   return RET_OK;
 }
 
-int FlattenCPUKernel::ReSize() { return RET_OK; }
-
 int FlattenCPUKernel::Run() {
   auto prepare_ret = Prepare();
   if (prepare_ret != RET_OK) {
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/fused_batchnorm.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/fused_batchnorm.cc
index 00810f89aaf825ba5c36462ddf41ae49b69fc82f..9f39c30eae70857e7789de8de88238256935a39a 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/fused_batchnorm.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/fused_batchnorm.cc
@@ -15,10 +15,10 @@
  */
 
 #include "src/runtime/kernel/arm/fp32/fused_batchnorm.h"
-#include <cmath>
 #include "schema/model_generated.h"
 #include "src/kernel_registry.h"
 #include "include/errorcode.h"
+#include "src/runtime/runtime_api.h"
 
 using mindspore::kernel::KERNEL_ARCH::kCPU;
 using mindspore::lite::KernelRegistrar;
@@ -27,33 +27,121 @@ using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_FusedBatchNorm;
 
 namespace mindspore::kernel {
+FusedBatchnormCPUKernel::~FusedBatchnormCPUKernel() {  // Frees the four buffers malloc'ed by InitConstTensor().
+  if (scale_addr_ != nullptr) {
+    free(scale_addr_);
+    scale_addr_ = nullptr;
+  }
+  if (offset_addr_ != nullptr) {
+    free(offset_addr_);
+    offset_addr_ = nullptr;
+  }
+  if (mean_addr_ != nullptr) {
+    free(mean_addr_);
+    mean_addr_ = nullptr;
+  }
+  if (var_addr_ != nullptr) {
+    free(var_addr_);
+    var_addr_ = nullptr;
+  }
+}
+
+int FusedBatchnormCPUKernel::InitConstTensor() {  // Copies inputs 1..4 into buffers owned by this kernel.
+  auto scale = inputs_[1];
+  scale_addr_ = reinterpret_cast<float *>(malloc(scale->ElementsNum() * sizeof(float)));
+  if (scale_addr_ == nullptr) {
+    MS_LOG(ERROR) << "Malloc buffer failed.";
+    return RET_ERROR;
+  }
+  memcpy(scale_addr_, scale->Data(), scale->ElementsNum() * sizeof(float));  // NOTE(review): Data() not null-checked
+
+  auto offset = inputs_[2];
+  offset_addr_ = reinterpret_cast<float *>(malloc(offset->ElementsNum() * sizeof(float)));
+  if (offset_addr_ == nullptr) {
+    MS_LOG(ERROR) << "Malloc buffer failed.";
+    return RET_ERROR;  // buffers allocated above stay owned by the kernel; the destructor releases them
+  }
+  memcpy(offset_addr_, offset->Data(), offset->ElementsNum() * sizeof(float));
+
+  auto mean = inputs_[3];
+  mean_addr_ = reinterpret_cast<float *>(malloc(mean->ElementsNum() * sizeof(float)));
+  if (mean_addr_ == nullptr) {
+    MS_LOG(ERROR) << "Malloc buffer failed.";
+    return RET_ERROR;
+  }
+  memcpy(mean_addr_, mean->Data(), mean->ElementsNum() * sizeof(float));
+
+  auto variance = inputs_[4];
+  var_addr_ = reinterpret_cast<float *>(malloc(variance->ElementsNum() * sizeof(float)));
+  if (var_addr_ == nullptr) {
+    MS_LOG(ERROR) << "Malloc buffer failed.";
+    return RET_ERROR;
+  }
+  memcpy(var_addr_, variance->Data(), variance->ElementsNum() * sizeof(float));
+  return RET_OK;  // RET_OK once all four copies exist, RET_ERROR if any malloc fails
+}
+
 int FusedBatchnormCPUKernel::Init() {
   if (context_->infer_shape_interrupt_ && !context_->running_) {
     SetNeedReInit();
     return RET_OK;
   }
-  input_shape_ = reinterpret_cast<int *>(malloc(sizeof(int) * inputs_[0]->shape().size()));
-  memcpy(input_shape_, inputs_[0]->shape().data(), inputs_[0]->shape().size() * sizeof(int));
+  auto input_shapes = inputs_[0]->shape();
+  auto n_dim = input_shapes.size();
+  batchnorm_param_->channel_ = input_shapes[n_dim - 1];
+  batchnorm_param_->unit_ = 1;
+  for (int i = 0; i < n_dim - 1; i++) {
+    batchnorm_param_->unit_ *= input_shapes[i];
+  }
+  batchnorm_param_->op_parameter_.thread_num_ =
+    MSMIN(batchnorm_param_->op_parameter_.thread_num_, batchnorm_param_->channel_);
+
+  auto ret = InitConstTensor();
+  if (ret != 0) {
+    MS_LOG(ERROR) << "FusedBatchnorm fp32 InitConstTensor failed.";
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
+int FusedBatchnormCPUKernel::ReSize() {  // Recomputes unit_ as the product of every input-0 dim except the last.
+  auto input_shapes = inputs_[0]->shape();
+  batchnorm_param_->unit_ = 1;
+  for (size_t i = 0; i + 1 < input_shapes.size(); i++) {  // "i + 1 <" avoids size() - 1 wrapping on an empty shape
+    batchnorm_param_->unit_ *= input_shapes[i];
+  }
+  return RET_OK;
+}
+
+int FusedBatchnormCPUKernel::Execute(int task_id) {
+  FusedBatchNorm(out_addr_, in_addr_, scale_addr_, offset_addr_, mean_addr_, var_addr_, task_id, batchnorm_param_);
   return RET_OK;
 }
 
-int FusedBatchnormCPUKernel::ReSize() { return RET_OK; }
+int FusedBatchNormRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) {  // Per-task launch callback.
+  auto g_kernel = reinterpret_cast<FusedBatchnormCPUKernel *>(cdata);  // Run() passes `this` as cdata; penv is unused
+  auto ret = g_kernel->Execute(task_id);
+  if (ret != RET_OK) {  // Execute() as written always returns RET_OK, so this branch is defensive
+    MS_LOG(ERROR) << "FusedBatchnormRun error task_id[" << task_id << "] error_code[" << ret << "]";
+    return ret;
+  }
+  return RET_OK;
+}
 
 int FusedBatchnormCPUKernel::Run() {
   auto prepare_ret = Prepare();
   if (prepare_ret != RET_OK) {
-    MS_LOG(ERROR) << "Prepare fail!ret: " << prepare_ret;
+    MS_LOG(ERROR) << "Prepare fail! Ret error code: " << prepare_ret;
     return prepare_ret;
   }
-  auto input_addr = reinterpret_cast<float *>(inputs_.at(0)->Data());
-  auto scale_addr = reinterpret_cast<float *>(inputs_.at(1)->Data());
-  auto offest_addr = reinterpret_cast<float *>(inputs_.at(2)->Data());
-  auto mean_addr = reinterpret_cast<float *>(inputs_.at(3)->Data());
-  auto variance_addr = reinterpret_cast<float *>(inputs_.at(4)->Data());
-  auto output_addr = reinterpret_cast<float *>(outputs_.at(0)->Data());
+  in_addr_ = reinterpret_cast<float *>(inputs_.at(0)->Data());
+  out_addr_ = reinterpret_cast<float *>(outputs_.at(0)->Data());
 
-  FusedBatchNorm(input_addr, scale_addr, offest_addr, mean_addr, variance_addr, input_shape_,
-                 fused_batchnorm_param_->epsilon_, output_addr);
+  int ret = LiteBackendParallelLaunch(FusedBatchNormRun, this, batchnorm_param_->op_parameter_.thread_num_);
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "FusedBatchnormRun error error_code[" << ret << "]";
+    return ret;
+  }
   return RET_OK;
 }
 
@@ -63,8 +151,8 @@ kernel::LiteKernel *CpuFusedBatchnormKernelCreator(const std::vector<lite::tenso
                                                    const kernel::KernelKey &desc, const lite::Primitive *primitive) {
   MS_ASSERT(opParameter != nullptr);
   MS_ASSERT(desc.type == schema::PrimitiveType_FusedBatchNorm);
-  FusedBatchnormCPUKernel *kernel = new (std::nothrow) FusedBatchnormCPUKernel(opParameter, inputs, outputs, ctx,
-                                                                               primitive);
+  FusedBatchnormCPUKernel *kernel =
+    new (std::nothrow) FusedBatchnormCPUKernel(opParameter, inputs, outputs, ctx, primitive);
   if (kernel == nullptr) {
     MS_LOG(ERROR) << "new FusedBatchnormCPUKernel fail!";
     return nullptr;
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/fused_batchnorm.h b/mindspore/lite/src/runtime/kernel/arm/fp32/fused_batchnorm.h
index 55c4ba2bb7d34fe6842243fe212c62a67d7c66db..a8b371874bb5e400e7aa37254e4c612124510d97 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/fused_batchnorm.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/fused_batchnorm.h
@@ -19,7 +19,7 @@
 
 #include <vector>
 #include "src/lite_kernel.h"
-#include "src/runtime/kernel/arm/nnacl/fused_batchnorm.h"
+#include "src/runtime/kernel/arm/nnacl/fp32/batchnorm.h"
 
 namespace mindspore::kernel {
 class FusedBatchnormCPUKernel : public LiteKernel {
@@ -28,17 +28,26 @@ class FusedBatchnormCPUKernel : public LiteKernel {
                           const std::vector<lite::tensor::Tensor *> &outputs, const lite::Context *ctx,
                           const lite::Primitive *primitive)
       : LiteKernel(parameter, inputs, outputs, ctx, primitive) {
-    fused_batchnorm_param_ = reinterpret_cast<FusedBatchNormParameter *>(parameter);
+    batchnorm_param_ = reinterpret_cast<BatchNormParameter *>(parameter);
   }
-  ~FusedBatchnormCPUKernel() override { delete fused_batchnorm_param_; }
+  ~FusedBatchnormCPUKernel() override;
 
   int Init() override;
   int ReSize() override;
   int Run() override;
 
+  int InitConstTensor();
+  int Execute(int task_id);
+
  private:
-  int *input_shape_{};
-  FusedBatchNormParameter *fused_batchnorm_param_;
+  float *in_addr_ = nullptr;      // input tensor data, refreshed on every Run()
+  float *mean_addr_ = nullptr;    // owned copies below are freed in the destructor when non-null, so they
+  float *var_addr_ = nullptr;     // must start as nullptr: Init() can return before InitConstTensor() runs
+  float *scale_addr_ = nullptr;   // owned copy of input 1
+  float *offset_addr_ = nullptr;  // owned copy of input 2
+  float *out_addr_ = nullptr;     // output tensor data, refreshed on every Run()
+
+  BatchNormParameter *batchnorm_param_ = nullptr;  // set by the constructor from the OpParameter it receives
 };
 }  // namespace mindspore::kernel
 
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/scale.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/scale.cc
index 69efb8a24e358cfaa1fb046f03ec354c8b962bcd..0bcf08d9d571826239ba9f91d80e84c150803e61 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/scale.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/scale.cc
@@ -17,7 +17,6 @@
 #include "src/runtime/kernel/arm/fp32/scale.h"
 #include <string.h>
 #include <vector>
-#include "src/runtime/kernel/arm/nnacl/scale.h"
 #include "schema/model_generated.h"
 #include "src/kernel_registry.h"
 #include "include/errorcode.h"
@@ -29,23 +28,29 @@ using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_Scale;
 
 namespace mindspore::kernel {
+ScaleCPUKernel::~ScaleCPUKernel() { FreeTmpBuffer(); }
+
 void ScaleCPUKernel::FreeTmpBuffer() {
-  if (scale_ != nullptr) {
-    free(scale_);
-    scale_ = nullptr;
+  if (scale_param_->const_scale_) {
+    if (scale_ != nullptr) {
+      free(scale_);
+      scale_ = nullptr;
+    }
   }
-  if (offset_ != nullptr) {
-    free(offset_);
-    offset_ = nullptr;
+  if (scale_param_->has_offset_) {
+    if (offset_ != nullptr) {
+      free(offset_);
+      offset_ = nullptr;
+    }
   }
 }
 
 int ScaleCPUKernel::InitScaleOffset() {
   FreeTmpBuffer();
-  auto param = reinterpret_cast<ScaleParameter *>(opParameter);
   auto scale_tensor = inputs_.at(1);
   float *scale_ptr = reinterpret_cast<float *>(inputs_.at(1)->Data());
   if (scale_ptr != nullptr) {
+    scale_param_->const_scale_ = true;
     scale_ = reinterpret_cast<float *>(malloc(scale_tensor->ElementsNum() * sizeof(float)));
     if (scale_ == nullptr) {
       MS_LOG(ERROR) << "Malloc buffer failed.";
@@ -53,6 +58,7 @@ int ScaleCPUKernel::InitScaleOffset() {
     }
     memcpy(scale_, scale_ptr, scale_tensor->ElementsNum() * sizeof(float));
   } else {
+    scale_param_->const_scale_ = false;
     scale_ = nullptr;
   }
 
@@ -64,40 +70,39 @@ int ScaleCPUKernel::InitScaleOffset() {
       return RET_ERROR;
     }
     memcpy(offset_, offset_tensor->Data(), offset_tensor->ElementsNum() * sizeof(float));
-    param->has_offset_ = true;
+    scale_param_->has_offset_ = true;
   } else {
     offset_ = nullptr;
-    param->has_offset_ = false;
+    scale_param_->has_offset_ = false;
   }
   return RET_OK;
 }
 
 int ScaleCPUKernel::InitParameter() {
-  auto param = reinterpret_cast<ScaleParameter *>(opParameter);
   auto in_tensor = inputs_.at(0);
   auto in_shape = in_tensor->shape();
   auto scale_tensor = inputs_.at(1);
   auto scale_shape = scale_tensor->shape();
 
-  if (scale_shape.size() + param->axis_ > in_shape.size()) {
+  if (scale_shape.size() + scale_param_->axis_ > in_shape.size()) {
     MS_LOG(ERROR) << "Scale tensor shape is incorrect.";
     return RET_ERROR;
   }
-  param->outer_size_ = 1;
-  param->axis_size_ = 1;
-  param->inner_size_ = 1;
-  for (int i = 0; i < param->axis_; i++) {
-    param->outer_size_ *= in_shape[i];
+  scale_param_->outer_size_ = 1;
+  scale_param_->axis_size_ = 1;
+  scale_param_->inner_size_ = 1;
+  for (int i = 0; i < scale_param_->axis_; i++) {
+    scale_param_->outer_size_ *= in_shape[i];
   }
   for (int i = 0; i < scale_shape.size(); i++) {
-    if (in_shape[i + param->axis_] != scale_shape[i]) {
+    if (in_shape[i + scale_param_->axis_] != scale_shape[i]) {
       MS_LOG(ERROR) << "Scale tensor shape is incorrect.";
       return RET_ERROR;
     }
-    param->axis_size_ *= in_shape[i + param->axis_];
+    scale_param_->axis_size_ *= in_shape[i + scale_param_->axis_];
   }
-  for (int i = param->axis_ + scale_shape.size(); i < in_shape.size(); i++) {
-    param->inner_size_ *= in_shape[i];
+  for (int i = scale_param_->axis_ + scale_shape.size(); i < in_shape.size(); i++) {
+    scale_param_->inner_size_ *= in_shape[i];
   }
   return RET_OK;
 }
@@ -130,9 +135,7 @@ int ScaleCPUKernel::ReSize() {
 }
 
 int ScaleCPUKernel::Scale(int task_id) {
-  auto ret =
-    DoScale(input_ptr_, output_ptr_, scale_, offset_, task_id, reinterpret_cast<ScaleParameter *>(opParameter));
-
+  auto ret = DoScale(input_ptr_, output_ptr_, scale_, offset_, task_id, scale_param_);
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Scale error task_id[" << task_id << "] error_code[" << ret << "]";
     return RET_ERROR;
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/scale.h b/mindspore/lite/src/runtime/kernel/arm/fp32/scale.h
index 4c19404d72f2f95bd33cd1d079fde280e4611f45..38ed5177468d19705abcc5dcb56c7407759901a2 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/scale.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/scale.h
@@ -19,6 +19,7 @@
 
 #include <vector>
 #include "src/lite_kernel.h"
+#include "src/runtime/kernel/arm/nnacl/scale.h"
 
 namespace mindspore::kernel {
 
@@ -27,10 +28,10 @@ class ScaleCPUKernel : public LiteKernel {
   ScaleCPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
                           const std::vector<lite::tensor::Tensor *> &outputs, const lite::Context *ctx,
                           const lite::Primitive *primitive)
-      : LiteKernel(parameter, inputs, outputs, ctx, primitive) {}
-  ~ScaleCPUKernel() {
-    FreeTmpBuffer();
+      : LiteKernel(parameter, inputs, outputs, ctx, primitive) {
+    scale_param_ = reinterpret_cast<ScaleParameter *>(opParameter);
   }
+  ~ScaleCPUKernel() override;
 
   int Init() override;
   int ReSize() override;
@@ -45,6 +46,7 @@ class ScaleCPUKernel : public LiteKernel {
   float *scale_;
   float *offset_;
   float *output_ptr_;
+  ScaleParameter *scale_param_;
 };
 }  // namespace mindspore::kernel
 
diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_int8.cc
index bf107f65c34f6c6efd608df0f0eab8a268bb8545..52e4ab6e53c076a4da91d38497abac106875b3c2 100644
--- a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_int8.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_int8.cc
@@ -28,6 +28,24 @@ using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_DepthwiseConv2D;
 
 namespace mindspore::kernel {
+// Releases the sliding-window parameters and the packing buffers. The buffers are C
+// allocations (they were released with free() before), so free() is used, not delete.
+ConvolutionDepthwiseInt8CPUKernel::~ConvolutionDepthwiseInt8CPUKernel() {
+  delete sliding;
+  if (packed_weight_ != nullptr) {
+    free(packed_weight_);
+    packed_weight_ = nullptr;
+  }
+  if (packed_input_ != nullptr) {
+    free(packed_input_);
+    packed_input_ = nullptr;
+  }
+  if (need_align_ && packed_output_ != nullptr) {  // NOTE(review): presumably only owned when aligned - confirm
+    free(packed_output_);
+    packed_output_ = nullptr;
+  }
+}
+
 int ConvolutionDepthwiseInt8CPUKernel::InitWeightBias() {
   // init weight, int8 -> int16
   // o, h, w, i -> o/8, h, w, i, 8; o == group, i == 1
@@ -111,10 +129,17 @@ int ConvolutionDepthwiseInt8CPUKernel::Init() {
 }
 
 int ConvolutionDepthwiseInt8CPUKernel::ReSize() {
-  free(packed_input_);
+  if (packed_input_ != nullptr) {
+    free(packed_input_);  // malloc-family buffer: must be released with free(), not delete
+    packed_input_ = nullptr;  // reset so a later release cannot free it twice
+  }
   if (need_align_) {
-    free(packed_output_);
+    if (packed_output_ != nullptr) {
+      free(packed_output_);
+      packed_output_ = nullptr;
+    }
   }
+
   // conv base init
   ConvolutionBaseCPUKernel::Init();
 
diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_int8.h b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_int8.h
index a6e068d90dbebf970ad25eed46357337df856989..e13ba163f05220e2bf0e538e58d6cfe9320a1233 100644
--- a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_int8.h
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_int8.h
@@ -29,14 +29,7 @@ class ConvolutionDepthwiseInt8CPUKernel : public ConvolutionBaseCPUKernel {
                                     const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx,
                                     const lite::Primitive *primitive)
       : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
-  ~ConvolutionDepthwiseInt8CPUKernel() override {
-    delete sliding;
-    free(packed_weight_);
-    free(packed_input_);
-    if (need_align_) {
-      free(packed_output_);
-    }
-  };
+  ~ConvolutionDepthwiseInt8CPUKernel() override;
 
   int Init() override;
   int ReSize() override;
diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_depthwise_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_depthwise_int8.cc
index 542898a08958a26d44320926268643999907cccf..b2c91baaa29110c60214db7424b486d8ca6f715c 100644
--- a/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_depthwise_int8.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_depthwise_int8.cc
@@ -28,6 +28,28 @@ using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_DeDepthwiseConv2D;
 
 namespace mindspore::kernel {
+// Releases the sliding-window parameters and every scratch buffer. The buffers are C
+// allocations (output_buffer_ comes from malloc() in InitBuffer), so they need free().
+DeconvolutionDepthwiseInt8CPUKernel::~DeconvolutionDepthwiseInt8CPUKernel() {
+  delete sliding;
+  if (packed_weight_ != nullptr) {
+    free(packed_weight_);
+    packed_weight_ = nullptr;
+  }
+  if (packed_input_ != nullptr) {
+    free(packed_input_);
+    packed_input_ = nullptr;
+  }
+  if (need_align_ && packed_output_ != nullptr) {  // NOTE(review): presumably only owned when aligned - confirm
+    free(packed_output_);
+    packed_output_ = nullptr;
+  }
+  if (output_buffer_ != nullptr) {
+    free(output_buffer_);
+    output_buffer_ = nullptr;
+  }
+}
+
 int DeconvolutionDepthwiseInt8CPUKernel::InitWeightBias() {
   // init weight: int8 -> int16
   // o, h, w, i -> o/8, h, w, i, 8; o == group, i == 1
@@ -101,9 +123,9 @@ int DeconvolutionDepthwiseInt8CPUKernel::InitBuffer() {
   }
 
   // malloc tmp buffer for int32 output
-  output_buffer =
+  output_buffer_ =
     reinterpret_cast<int32_t *>(malloc(conv_param_->output_h_ * conv_param_->output_w_ * C4NUM * sizeof(int32_t)));
-  if (output_buffer == nullptr) {
+  if (output_buffer_ == nullptr) {
     MS_LOG(ERROR) << "Malloc buffer failed.";
     return RET_ERROR;
   }
@@ -144,10 +166,21 @@ int DeconvolutionDepthwiseInt8CPUKernel::Init() {
 }
 
 int DeconvolutionDepthwiseInt8CPUKernel::ReSize() {
-  free(packed_input_);
+  if (packed_input_ != nullptr) {
+    free(packed_input_);  // malloc-family buffer: must be released with free(), not delete
+    packed_input_ = nullptr;  // reset so a later release cannot free it twice
+  }
   if (need_align_) {
-    free(packed_output_);
+    if (packed_output_ != nullptr) {
+      free(packed_output_);
+      packed_output_ = nullptr;
+    }
+  }
+  if (output_buffer_ != nullptr) {
+    free(output_buffer_);  // allocated with malloc() in InitBuffer
+    output_buffer_ = nullptr;
   }
+
   InitSlideParam();
 
   // conv base init
@@ -162,7 +195,7 @@ int DeconvolutionDepthwiseInt8CPUKernel::ReSize() {
 }
 
 int DeconvolutionDepthwiseInt8CPUKernel::Execute(int task_id) {
-  DeconvDwInt8(packed_output_, output_buffer, packed_input_, packed_weight_, reinterpret_cast<int32_t *>(bias_data_),
+  DeconvDwInt8(packed_output_, output_buffer_, packed_input_, packed_weight_, reinterpret_cast<int32_t *>(bias_data_),
                conv_param_, sliding, task_id);
   return RET_OK;
 }
diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_depthwise_int8.h b/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_depthwise_int8.h
index d7b76438f3b82b3274f72266b1dc494febf67d35..3b7ac123e35769151d42c3ca38d852411052ed93 100644
--- a/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_depthwise_int8.h
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_depthwise_int8.h
@@ -29,14 +29,7 @@ class DeconvolutionDepthwiseInt8CPUKernel : public ConvolutionBaseCPUKernel {
                                       const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx,
                                       const lite::Primitive *primitive)
       : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
-  ~DeconvolutionDepthwiseInt8CPUKernel() override {
-    delete sliding;
-    free(packed_weight_);
-    free(packed_input_);
-    if (need_align_) {
-      free(packed_output_);
-    }
-  };
+  ~DeconvolutionDepthwiseInt8CPUKernel() override;
 
   int Init() override;
   int ReSize() override;
@@ -52,7 +45,7 @@ class DeconvolutionDepthwiseInt8CPUKernel : public ConvolutionBaseCPUKernel {
   int16_t *packed_weight_;
   int16_t *packed_input_;
   int8_t *packed_output_;
-  int32_t *output_buffer;
+  int32_t *output_buffer_;
   bool need_align_ = false;
 };
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/flatten.h b/mindspore/lite/src/runtime/kernel/arm/nnacl/flatten.h
index b2b2fdfebb32cd2e85305d6aff490a0e8b6fafa3..3d29b50f1af13eba8d2de28906391657bf027c71 100644
--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/flatten.h
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/flatten.h
@@ -24,4 +24,3 @@ typedef struct FlattenParameter {
 
 void Flatten(const void *input, void *output, FlattenParameter *flatten_param);
 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_FLATTEN_H_
-
diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/batchnorm.cc b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/batchnorm.cc
index 269528e0a498f06a9c943e1725ba3be5fddd5f3b..4e08156d3cea839095455eab2551482170756089 100644
--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/batchnorm.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/batchnorm.cc
@@ -19,10 +19,21 @@
 
 void BatchNorm(float *output_ptr, const float *input_ptr, const float *mean_ptr, const float *variance_ptr, int task_id,
                BatchNormParameter *param) {
-  for (int u = task_id; u < param->unit_; u += param->op_parameter_.thread_num_) {
-    for (int c = 0; c < param->channel_; c++) {
-      auto variance_sqrt = sqrt(variance_ptr[c] + param->epsilon_);
+  for (int c = task_id; c < param->channel_; c += param->op_parameter_.thread_num_) {  // channels strided by task
+    auto variance_sqrt = sqrt(variance_ptr[c] + param->epsilon_);  // sqrt(var + eps), computed once per channel
+    for (int u = 0; u < param->unit_; u++) {  // element (u, c) is stored at u * channel_ + c
       output_ptr[u * param->channel_ + c] = (input_ptr[u * param->channel_ + c] - mean_ptr[c]) / variance_sqrt;
     }
   }
 }
+
+void FusedBatchNorm(float *output_ptr, const float *input_ptr, const float *scale_ptr, const float *offest_ptr,
+                    const float *mean_ptr, const float *variance_ptr, int task_id, BatchNormParameter *param) {
+  for (int c = task_id; c < param->channel_; c += param->op_parameter_.thread_num_) {  // channels strided by task
+    auto variance_sqrt = sqrt(variance_ptr[c] + param->epsilon_);  // sqrt(var + eps), computed once per channel
+    for (int u = 0; u < param->unit_; u++) {  // element (u, c) is stored at u * channel_ + c
+      output_ptr[u * param->channel_ + c] =  // (x - mean) / sqrt(var + eps) * scale + offset
+        (input_ptr[u * param->channel_ + c] - mean_ptr[c]) / variance_sqrt * scale_ptr[c] + offest_ptr[c];
+    }
+  }
+}
diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/batchnorm.h b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/batchnorm.h
index c6103565e1fd7d5c3179aeb4f2ee770966428620..b4c187ba9a46dc43f656864cd95db10c820804d2 100644
--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/batchnorm.h
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/batchnorm.h
@@ -29,4 +29,7 @@ typedef struct BatchNormParameter {
 void BatchNorm(float *output_ptr, const float *input_ptr, const float *mean_ptr, const float *variance_ptr, int task_id,
                BatchNormParameter *param);
 
+void FusedBatchNorm(float *output_ptr, const float *input_ptr, const float *scale_ptr, const float *offest_ptr,
+                    const float *mean_ptr, const float *variance_ptr, int task_id, BatchNormParameter *param);
+
 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_FUSED_BATCHNORM_H_
diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/conv_depthwise.cc b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/conv_depthwise.cc
index cd68bc467ef88922bf5fb49244b251be6fb0eaf2..3f543a1d38f3960fdd4bc02c1e8a792e9ff219fa 100644
--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/conv_depthwise.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/conv_depthwise.cc
@@ -486,6 +486,21 @@ void ConvDw3x3Fp32OutputUnit(float *src_buf, float *dst_output, const float *bia
   float32x4_t d10 = vaddq_f32(vaddq_f32(vaddq_f32(t10, t11), t12), bias_ptr);
   float32x4_t d11 = vaddq_f32(vsubq_f32(vsubq_f32(t11, t12), t13), bias_ptr);
 
+  float32x4_t zeros = {0, 0, 0, 0};
+  float32x4_t bounds = {6, 6, 6, 6};
+  if (is_relu) {
+    d00 = vmaxq_f32(d00, zeros);
+    d01 = vmaxq_f32(d01, zeros);
+    d10 = vmaxq_f32(d10, zeros);
+    d11 = vmaxq_f32(d11, zeros);
+  }
+  if (is_relu6) {
+    d00 = vminq_f32(vmaxq_f32(d00, zeros), bounds);
+    d01 = vminq_f32(vmaxq_f32(d01, zeros), bounds);
+    d10 = vminq_f32(vmaxq_f32(d10, zeros), bounds);
+    d11 = vminq_f32(vmaxq_f32(d11, zeros), bounds);
+  }
+
   vst1q_f32(dst_output, d00);
   if (w_in_range) {
     vst1q_f32(dst_output + channel, d01);
@@ -536,6 +551,19 @@ void ConvDw3x3Fp32OutputUnit(float *src_buf, float *dst_output, const float *bia
     float d10 = t10 + t11 + t12 + bias_ptr[0];
     float d11 = t11 - t12 - t13 + bias_ptr[0];
 
+    if (is_relu) {
+      d00 = MSMAX(d00, 0);
+      d01 = MSMAX(d01, 0);
+      d10 = MSMAX(d10, 0);
+      d11 = MSMAX(d11, 0);
+    }
+    if (is_relu6) {
+      d00 = MSMIN(MSMAX(d00, 0), 6);
+      d01 = MSMIN(MSMAX(d01, 0), 6);
+      d10 = MSMIN(MSMAX(d10, 0), 6);
+      d11 = MSMIN(MSMAX(d11, 0), 6);
+    }
+
     (dst_output + i)[0] = d00;
     if (w_in_range) {
       (dst_output + i + channel)[0] = d01;
diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/fused_batchnorm.cc b/mindspore/lite/src/runtime/kernel/arm/nnacl/fused_batchnorm.cc
deleted file mode 100644
index c740c9cdb1cefe76c351b8c4d98e0406d7086b9f..0000000000000000000000000000000000000000
--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/fused_batchnorm.cc
+++ /dev/null
@@ -1,35 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "nnacl/fused_batchnorm.h"
-#include <math.h>
-
-void FusedBatchNorm(const float *input_ptr, const float *scale_ptr, const float *offest_ptr, const float *mean_ptr,
-                    const float *variance_ptr, int *input_shapes, float epsilon, float *output_ptr) {
-  int channel = input_shapes[3];
-  int units = 1;
-  for (int i = 0; i < 3; i++) {
-    units *= input_shapes[i];
-  }
-  for (int c = 0; c < input_shapes[3]; c++) {
-    auto variance_sqrt = sqrt(variance_ptr[c] + epsilon);
-    for (int u = 0; u < units; u++) {
-      output_ptr[u * channel + c] =
-        (input_ptr[u * channel + c] - mean_ptr[c]) / variance_sqrt * scale_ptr[c] + offest_ptr[c];
-    }
-  }
-}
-
diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/fused_batchnorm.h b/mindspore/lite/src/runtime/kernel/arm/nnacl/fused_batchnorm.h
deleted file mode 100644
index 259b967ac69b83642796c7b4f7cc708e3a558497..0000000000000000000000000000000000000000
--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/fused_batchnorm.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_FUSED_BATCHNORM_H_
-#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_FUSED_BATCHNORM_H_
-
-#include "nnacl/op_base.h"
-
-typedef struct FusedBatchNormParameter {
-    OpParameter op_parameter_;
-    float epsilon_;
-} FusedBatchNormParameter;
-
-void FusedBatchNorm(const float *input_ptr, const float *scale_ptr, const float *offest_ptr, const float *mean_ptr,
-                    const float *variance_ptr, int *input_shapes, float epsilon, float *output_ptr);
-
-
-#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_FUSED_BATCHNORM_H_
-
diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/scale.h b/mindspore/lite/src/runtime/kernel/arm/nnacl/scale.h
index 0b1b733f5fa2e776f6387c976b811867ad4e8136..c4dee2a0daf144db73dfd5cba411a7b4bd3c79ff 100644
--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/scale.h
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/scale.h
@@ -25,10 +25,9 @@ typedef struct ScaleParameter {
   int axis_size_;
   int inner_size_;
   int axis_;
-  bool has_offset_;
-  // todo yangruoqi: axis
+  bool const_scale_ = false;
+  bool has_offset_ = false;
 } ScaleParameter;
 
 int DoScale(float *in_data, float *out_data, float *scale, float *offset, int task_id, ScaleParameter *scale_param);
 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_SCALE_H_
-
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/batchnorm_fp32_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/batchnorm_fp32_tests.cc
index b65e6ecaabc62926c8f398cee11af932d83ff518..7bd03d842da44585d60728e11240763b100956ba 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/batchnorm_fp32_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/batchnorm_fp32_tests.cc
@@ -17,33 +17,20 @@
 #include "mindspore/core/utils/log_adapter.h"
 #include "common/common_test.h"
 #include "mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/batchnorm.h"
-#include "mindspore/lite/src/runtime/kernel/arm/nnacl/fused_batchnorm.h"
 #include "mindspore/lite/src/kernel_registry.h"
 #include "mindspore/lite/src/lite_kernel.h"
-#include "mindspore/lite/src/common/file_utils.h"
 
 namespace mindspore {
-
 class TestBatchnormFp32 : public mindspore::Common {
  public:
   TestBatchnormFp32() {}
 };
 
 TEST_F(TestBatchnormFp32, BNTest) {
-  std::vector<float> in_data = {0.0669681, 0.959215, 0.252686,  0.613594,  0.811776,  0.139469,  0.322848,  0.118354,
-                                0.082978,  0.399467, 0.961267,  0.0247456, 0.0714259, 0.0791484, 0.0648625, 0.561612,
-                                0.412069,  0.311492, 0.46109,   0.377125,  0.369283,  0.0332446, 0.696142,  0.715973,
-                                0.525524,  0.477265, 0.0336351, 0.751577,  0.377548,  0.964603,  0.0196834, 0.174865};
-  std::vector<float> in_data1 = {0.855446, 0.821765, 0.281008, 0.0798653, 0.22294,  0.793782, 0.963222, 0.17851,
-                                 0.667549, 0.274381, 0.592842, 0.216552,  0.190274, 0.237873, 0.610063, 0.307559,
-                                 0.830007, 0.760957, 0.583265, 0.763793,  0.456372, 0.391378, 0.547915, 0.862198,
-                                 0.510794, 0.826776, 0.515894, 0.30071,   0.404987, 0.184773};
-  std::vector<float> in_data2 = {0.712438, 0.4927,   0.078419, 0.310429, 0.546871, 0.0667141, 0.874321, 0.0265647,
-                                 0.685165, 0.732586, 0.952889, 0.506402, 0.540784, 0.131119,  0.357713, 0.678992,
-                                 0.960839, 0.340706, 0.697678, 0.398146, 0.313321, 0.6485,    0.739153, 0.00190134,
-                                 0.536842, 0.996873, 0.445276, 0.371212, 0.420397, 0.0930115};
-  std::vector<float> in_data3(32, 1);
-  std::vector<float> in_data4(32, 0);
+  std::vector<float> in_data = {-11.18675,  11.433986,  11.386012, 11.245945,   -2.7614849, 14.692399,
+                                -1.1983503, -6.6790967, 6.383416,  -13.3213005, -8.693595,  9.476344};
+  std::vector<float> in_data1 = {12.352293, 5.122387, 14.249514};
+  std::vector<float> in_data2 = {14.632595, 0.70900035, 11.179003};
   std::vector<lite::tensor::Tensor *> inputs_tensor;
   std::vector<lite::tensor::Tensor *> outputs_tensor;
 
@@ -51,8 +38,7 @@ TEST_F(TestBatchnormFp32, BNTest) {
   op_param.op_parameter_.type_ = schema::PrimitiveType_BatchNorm;
   op_param.epsilon_ = 0.001f;
 
-  std::vector<int> in_shape = {1, 2, 4, 4};
-
+  std::vector<int> shape = {1, 2, 2, 3};
   lite::tensor::Tensor input0_tensor;
   lite::tensor::Tensor input1_tensor;
   lite::tensor::Tensor input2_tensor;
@@ -62,39 +48,40 @@ TEST_F(TestBatchnormFp32, BNTest) {
   input0_tensor.SetData(in_data.data());
   input1_tensor.SetData(in_data1.data());
   input2_tensor.SetData(in_data2.data());
-  input0_tensor.set_shape(in_shape);
+  input0_tensor.set_shape(shape);
+  input1_tensor.set_shape({3});
+  input2_tensor.set_shape({3});
 
-  std::vector<float> output(32);
-  std::vector<float> corr_out(32);
-  std::vector<int> output_shape = {1, 2, 4, 4};
+  std::vector<float> output(12);
+  std::vector<float> corr_out = {-6.1533737, 7.4904885,  -0.8563998, -0.289212,  -9.356432,  0.13245535,
+                                 -3.5422924, -14.005781, -2.3525476, -6.7113695, -16.396551, -1.4275324};
 
   lite::tensor::Tensor output0_tensor;
   outputs_tensor.push_back(&output0_tensor);
   output0_tensor.SetData(output.data());
+  output0_tensor.set_shape(shape);
   kernel::KernelKey desc = {kernel::KERNEL_ARCH::kCPU, kNumberTypeFloat32, schema::PrimitiveType_BatchNorm};
   auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc);
   ASSERT_NE(creator, nullptr);
   lite::Context ctx;
-  ctx.thread_num_ = 7;
+  ctx.thread_num_ = 1;
   kernel::LiteKernel *kernel =
     creator(inputs_tensor, outputs_tensor, reinterpret_cast<OpParameter *>(&op_param), &ctx, desc, nullptr);
   ASSERT_NE(kernel, nullptr);
   auto output_tensor_shape = output0_tensor.shape();
   kernel->Run();
 
-  FusedBatchNorm(in_data.data(), in_data3.data(), in_data4.data(), in_data1.data(), in_data2.data(), in_shape.data(),
-                 0.001f, corr_out.data());
-
   printf("==================output data=================\n");
-  for (int i = 0; i < 1 * 28; i++) {
+  for (int i = 0; i < output0_tensor.ElementsNum(); i++) {
     std::cout << output[i] << " ,";
   }
   std::cout << std::endl;
-  CompareOutputData(output.data(), corr_out.data(), 32, 0.00001);
+  CompareOutputData(output.data(), corr_out.data(), output0_tensor.ElementsNum(), 0.001);
 
   input0_tensor.SetData(nullptr);
   input1_tensor.SetData(nullptr);
   input2_tensor.SetData(nullptr);
   output0_tensor.SetData(nullptr);
+  MS_LOG(INFO) << "TestBathNormFp32 accuracy passed";
 }
 }  // namespace mindspore
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/batchnorm/fusedBatchnorm_input_0.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/batchnorm/fusedBatchnorm_input_0.bin
deleted file mode 100644
index b22edaef0e7683cd56ace393a1dcd5c8b061c979..0000000000000000000000000000000000000000
Binary files a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/batchnorm/fusedBatchnorm_input_0.bin and /dev/null differ
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/batchnorm/fusedBatchnorm_input_1.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/batchnorm/fusedBatchnorm_input_1.bin
deleted file mode 100644
index 437a6958ad7efb877634c6048fd294ab8a2426a7..0000000000000000000000000000000000000000
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/batchnorm/fusedBatchnorm_input_1.bin
+++ /dev/null
@@ -1 +0,0 @@
-�L[?-"R>�q�>{B�>�?yx?��_>JSD>G�0?
\ No newline at end of file
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/batchnorm/fusedBatchnorm_input_2.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/batchnorm/fusedBatchnorm_input_2.bin
deleted file mode 100644
index 4708330c95150b02913fb30726e2acf9f32047bb..0000000000000000000000000000000000000000
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/batchnorm/fusedBatchnorm_input_2.bin
+++ /dev/null
@@ -1 +0,0 @@
-J[q?��P?���>g�?�A?>oo?7G?x�<��"?
\ No newline at end of file
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/batchnorm/fusedBatchnorm_input_3.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/batchnorm/fusedBatchnorm_input_3.bin
deleted file mode 100644
index ca38daf512865dd08e8bae7f84bb584f57ed8672..0000000000000000000000000000000000000000
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/batchnorm/fusedBatchnorm_input_3.bin
+++ /dev/null
@@ -1 +0,0 @@
-W�U>X�8?*�?!�v>��F>0�?.�<�C?�d?
\ No newline at end of file
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/batchnorm/fusedBatchnorm_input_4.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/batchnorm/fusedBatchnorm_input_4.bin
deleted file mode 100644
index dd1fa36149bd64ff30f4e56954034d58ed336cec..0000000000000000000000000000000000000000
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/batchnorm/fusedBatchnorm_input_4.bin
+++ /dev/null
@@ -1 +0,0 @@
-�R?�]?��>�c~?um?z1->�??�'?�U?
\ No newline at end of file
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/batchnorm/fusedBatchnorm_out.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/batchnorm/fusedBatchnorm_out.bin
deleted file mode 100644
index 9bc4e213954d4d9a897699b709331dd1cbc54c91..0000000000000000000000000000000000000000
Binary files a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/batchnorm/fusedBatchnorm_out.bin and /dev/null differ