diff --git a/mindspore/lite/schema/model.fbs b/mindspore/lite/schema/model.fbs
index b71fa966c125c70e45793c1e9e3a22ed8ccdb091..7cc9e082b4b25ced0c4af20d7efc9ff991cbd482 100644
--- a/mindspore/lite/schema/model.fbs
+++ b/mindspore/lite/schema/model.fbs
@@ -174,6 +174,19 @@ union PrimitiveType {
     Where,
     OneHot,
     Lstm,
+    Conv2DGradFilter,
+    Conv2DGradInput,
+    PoolingGrad,
+    BNGradInput,
+    OptMomentum,
+    BiasGrad,
+    SoftmaxCrossEntropy,
+    AddGrad,
+    SubGrad,
+    MulGrad,
+    DivGrad,
+    PowerGrad,
+    ActivationGrad,
     PriorBox
 }
 
diff --git a/mindspore/lite/schema/ops.fbs b/mindspore/lite/schema/ops.fbs
index 52fb99aabe734a4c12e86b2cfb9e0d2df7c065e9..a78b43b34963471571843cd0e95493f83a57dd75 100644
--- a/mindspore/lite/schema/ops.fbs
+++ b/mindspore/lite/schema/ops.fbs
@@ -55,7 +55,25 @@ enum ActivationType : byte {
     LINEAR = 15,
     UNKNOW = 16
 }
-
+enum ActivationGradType : byte {
+    NO_ACTIVATION = 0,
+    RELU = 1,
+    SIGMOID = 2,
+    RELU6 = 3,
+    ELU = 4,
+    LEAKY_RELU = 5,
+    ABS = 6,
+    RELU1 = 7,
+    SOFTSIGN = 8,
+    SOFTPLUS = 9,
+    TANH = 10,
+    SELU = 11,
+    HSWISH = 12,
+    HSIGMOID = 13,
+    THRESHOLDRELU = 14,
+    LINEAR = 15,
+    UNKNOW = 16
+}
 enum ReduceType : byte {
     REDUCE_MAX = 0,
     REDUCE_MEAN = 1,
@@ -125,6 +143,10 @@ table SoftMax {
 table Activation {
     type: ActivationType = 0;
 }
+table ActivationGrad {
+    type: ActivationGradType = 0;
+}
+
 
 table Conv2D {
     format: Format = 0;
@@ -146,7 +168,45 @@ table Conv2D {
     activationType: ActivationType = 0;
 }
 
-table FusedBatchNorm {
+table Conv2DGradFilter {
+    format: Format = 0;
+    group: int;
+    channelIn: int;
+    channelOut: int;
+    kernelW: int;
+    kernelH: int;
+    strideW: int;
+    strideH: int;
+    padMode: PadMode;
+    padUp: int;
+    padDown: int;
+    padLeft: int;
+    padRight: int;
+    dilateW: int;
+    dilateH: int;
+    hasBias: bool = false;
+    activationType: ActivationType = 0;
+}
+
+table Conv2DGradInput {
+    format: Format = 0;
+    group: int;
+    channelIn: int;
+    channelOut: int;
+    kernelW: int;
+    kernelH: int;
+    strideW: int;
+    strideH: int;
+    padMode: PadMode;
+    padUp: int;
+    padDown: int;
+    padLeft: int;
+    padRight: int;
+    dilateW: int;
+    dilateH: int;
+    hasBias: bool = false;
+    activationType: ActivationType = 0;
+}table FusedBatchNorm {
     epsilon: float = 0.00001;   // eg. epsilon=0.001
     momentum: float = 0.9;
     spatial: int = 1;
@@ -156,6 +216,31 @@ table CaffeBatchNorm {
     epsilon: float;   // eg. epsilon=0.001
 }
 
+table BiasGrad {
+    axis: [int];
+}
+
+
+table SoftmaxCrossEntropy {
+    axis: [int];
+}
+
+
+table PoolingGrad {
+    format: Format = 0;
+    poolingMode: PoolMode;
+    global: bool = false;
+    windowW: int;
+    windowH: int;
+    strideW: int;
+    strideH: int;
+    padMode: PadMode;
+    padUp: int;
+    padDown: int;
+    padLeft: int;
+    padRight: int;
+    roundMode: RoundMode;
+}
 table Shape {
 }
 
@@ -286,7 +371,10 @@ table DeConv2D {
     hasBias: bool = false;
     activationType: ActivationType = 0;
 }
-
+table BNGradInput {
+    eps : float;
+    channels: int;
+}
 table Scale {
     format: Format = 0;
 }
@@ -307,6 +395,17 @@ table Mul {
 table Div {
 }
 
+table AddGrad {
+}
+
+table SubGrad {
+}
+
+table MulGrad {
+}
+
+table DivGrad {
+}
 table RealDiv {
 }
 
@@ -389,7 +488,11 @@ table Power {
     scale: float;
     shift: float;
 }
-
+table PowerGrad {
+    power: float;
+    scale: float;
+    shift: float;
+}
 table ArgMax {
     axis: int;
     outMaxValue: bool;
@@ -712,6 +815,10 @@ table SquaredDifference {
 table TupleGetItem {
 }
 
+table OptMomentum {
+}
+
+
 table Where{
 }
 
diff --git a/mindspore/lite/src/common/file_utils_ext.cc b/mindspore/lite/src/common/file_utils_ext.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cdaa337e23da0631bf31f6649d44df0f9c3f464c
--- /dev/null
+++ b/mindspore/lite/src/common/file_utils_ext.cc
@@ -0,0 +1,53 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <cmath>
+#include <cstddef>
+#include <iostream>
+#include "src/common/file_utils.h"
+#include "src/common/file_utils_ext.h"
+
+namespace mindspore {
+namespace lite {
+static int CompareOutputRelativeData(float *output_data, float *correct_data, int data_size) {
+  float error = 0;
+
+  // relative error
+  float diffSum = 0.0f;
+  float sum = 0.0f;
+  for (int i = 0; i < data_size; i++) {
+    sum += std::abs(correct_data[i]);
+  }
+  for (int i = 0; i < data_size; i++) {
+    float diff = std::abs(output_data[i] - correct_data[i]);
+    diffSum += diff;
+  }
+  error = diffSum / sum;
+  if (error > 1e-4) {
+    std::cout << "has accuracy error!\n" << error << "\n";
+    return 1;
+  }
+  return 0;
+}
+
+int CompareRelativeOutput(float *output_data, std::string file_path) {
+  size_t output_size;
+  auto ground_truth = reinterpret_cast<float *>(mindspore::lite::ReadFile(file_path.c_str(), &output_size));
+  size_t output_num = output_size / sizeof(float);
+  std::cout << "output num : " << output_num << "\n";
+  return CompareOutputRelativeData(output_data, ground_truth, output_num);
+}
+}  // namespace lite
+}  // namespace mindspore
diff --git a/mindspore/lite/src/common/file_utils_ext.h b/mindspore/lite/src/common/file_utils_ext.h
new file mode 100644
index 0000000000000000000000000000000000000000..28eea02e41af566099edc51c83adc4c026f5d05e
--- /dev/null
+++ b/mindspore/lite/src/common/file_utils_ext.h
@@ -0,0 +1,28 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_COMMON_FILE_UTILS_EXT_H_
+#define MINDSPORE_LITE_COMMON_FILE_UTILS_EXT_H_
+#include <string>
+
+
+namespace mindspore {
+namespace lite {
+int CompareRelativeOutput(float *output_data, std::string file_path);
+
+}
+}  // namespace mindspore
+#endif  // MINDSPORE_LITE_COMMON_FILE_UTILS_EXT_H_
diff --git a/mindspore/lite/src/lite_kernel.h b/mindspore/lite/src/lite_kernel.h
index 7cd2cfa9f222718e323e083532742437fa91f13d..b9086ee675ec923f06bc89206ec524c9eb355d46 100644
--- a/mindspore/lite/src/lite_kernel.h
+++ b/mindspore/lite/src/lite_kernel.h
@@ -64,7 +64,7 @@ class LiteKernel {
   LiteKernel() = default;
   explicit LiteKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
                       const std::vector<lite::tensor::Tensor *> &outputs)
-      : opParameter(parameter), inputs_(inputs), outputs_(outputs) {
+      : opParameter(parameter), inputs_(inputs), outputs_(outputs), train_mode(false) {
     this->in_kernel_.clear();
     this->out_kernel_.clear();
   }
@@ -77,7 +77,10 @@ class LiteKernel {
   virtual int Run() { return -1; }
 
   std::string Name() { return this->name; }
-
+  virtual void train() { train_mode = true; }
+  virtual bool is_train() { return train_mode == true; }
+  virtual void eval() { train_mode = false; }
+  virtual bool is_eval() { return train_mode == false; }
   void set_name(const std::string &name) { this->name = name; }
 
   schema::PrimitiveType type() { return (schema::PrimitiveType)this->opParameter->type_; }
@@ -117,6 +120,7 @@ class LiteKernel {
   std::vector<lite::tensor::Tensor *> outputs_;
   std::vector<LiteKernel *> in_kernel_;
   std::vector<LiteKernel *> out_kernel_;
+  bool train_mode;
 };
 
 class SubGraphKernel : public LiteKernel {
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/activation_grad.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/activation_grad.cc
new file mode 100644
index 0000000000000000000000000000000000000000..279832aca2fc8bf2f4cdb36cfc71a450d0336062
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/activation_grad.cc
@@ -0,0 +1,110 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/runtime/kernel/arm/fp32/activation_grad.h"
+#include "schema/model_generated.h"
+#include "src/kernel_registry.h"
+#include "src/runtime/runtime_api.h"
+#include "include/errorcode.h"
+
+using mindspore::lite::KernelRegistrar;
+using mindspore::kernel::KERNEL_ARCH::kCPU;
+using mindspore::lite::RET_ERROR;
+using mindspore::lite::RET_OK;
+using mindspore::schema::ActivationGradType_HSWISH;
+using mindspore::schema::ActivationGradType_LEAKY_RELU;
+using mindspore::schema::ActivationGradType_RELU;
+using mindspore::schema::ActivationGradType_RELU6;
+using mindspore::schema::PrimitiveType_ActivationGrad;
+
+namespace mindspore::kernel {
+int ActivationGradCPUKernel::Init() {
+    outputs_[0]->set_shape(inputs_[0]->shape());
+    return RET_OK;
+}
+
+int ActivationGradCPUKernel::ReSize() { return RET_OK; }
+
+int ActivationGradCPUKernel::DoActivation(int task_id) {
+  auto yt_addr = reinterpret_cast<float *>(inputs_.at(0)->Data());
+  auto input_addr = reinterpret_cast<float *>(inputs_.at(1)->Data());
+  auto output_addr = reinterpret_cast<float *>(outputs_.at(0)->Data());
+  auto length = inputs_.at(0)->ElementsNum();
+
+  auto error_code = RET_OK;
+
+  if (type_ == schema::ActivationGradType_RELU) {
+    error_code = ReluGrad(yt_addr, input_addr, length, output_addr);
+  } else if (type_ == schema::ActivationGradType_RELU6) {
+    error_code = Relu6Grad(yt_addr, input_addr, length, output_addr);
+  } else if (type_ == schema::ActivationGradType_LEAKY_RELU) {
+    error_code = LReluGrad(yt_addr, input_addr, length, output_addr, alpha_);
+  } else if (type_ == schema::ActivationGradType_SIGMOID) {
+    error_code = SigmoidGrad(yt_addr, input_addr, length, output_addr);
+  } else if (type_ == schema::ActivationGradType_TANH) {
+    error_code = TanhGrad(yt_addr, input_addr, length, output_addr);
+  } else if (type_ == schema::ActivationGradType_HSWISH) {
+    error_code = HSwishGrad(yt_addr, input_addr, length, output_addr);
+    } else if (type_ == schema::ActivationGradType_HSIGMOID) {
+    error_code = HSigmoidGrad(yt_addr, input_addr, length, output_addr);
+  } else {
+    MS_LOG(ERROR) << "Activation type error";
+    return RET_ERROR;
+  }
+  if (error_code != RET_OK) {
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
+int ActivationGradRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) {
+  auto activationGrad_kernel = reinterpret_cast<ActivationGradCPUKernel *>(cdata);
+  auto error_code = activationGrad_kernel->DoActivation(task_id);
+  if (error_code != RET_OK) {
+    MS_LOG(ERROR) << "ActivationGradRun error task_id[" << task_id << "] error_code[" << error_code << "]";
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
+int ActivationGradCPUKernel::Run() {
+  int error_code = LiteBackendParallelLaunch(ActivationGradRun, this, thread_count_);
+  if (error_code != RET_OK) {
+    MS_LOG(ERROR) << "Activation function error error_code[" << error_code << "]";
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
+kernel::LiteKernel *CpuActivationGradFp32KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
+                                                      const std::vector<lite::tensor::Tensor *> &outputs,
+                                                      OpParameter *opParameter, const lite::Context *ctx,
+                                                      const kernel::KernelKey &desc) {
+  MS_ASSERT(opParameter != nullptr);
+  MS_ASSERT(desc.type == schema::PrimitiveType_ActivationGrad);
+  auto *kernel = new (std::nothrow) ActivationGradCPUKernel(opParameter, inputs, outputs);
+  MS_ASSERT(kernel != nullptr);
+  auto ret = kernel->Init();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "InferShape kernel failed, name: " << opParameter->name_
+                  << ", type: "
+                  << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_));
+  }
+  return kernel;
+}
+
+REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_ActivationGrad, CpuActivationGradFp32KernelCreator)
+}  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/activation_grad.h b/mindspore/lite/src/runtime/kernel/arm/fp32/activation_grad.h
new file mode 100644
index 0000000000000000000000000000000000000000..4fc4265d49fc49d17aee26b6a39d15fb36194799
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/activation_grad.h
@@ -0,0 +1,50 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_ACTIVATION_GRAD_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_ACTIVATION_GRAD_H_
+
+#include <vector>
+#include "src/lite_kernel.h"
+#include "ir/anf.h"
+
+#include "src/runtime/kernel/arm/opclib/activation_grad.h"
+
+namespace mindspore::kernel {
+class ActivationGradCPUKernel : public LiteKernel {
+ public:
+  explicit ActivationGradCPUKernel(OpParameter *param, const std::vector<lite::tensor::Tensor *> &inputs,
+                                   const std::vector<lite::tensor::Tensor *> &outputs)
+      : LiteKernel(param, inputs, outputs) {
+    ActivationGradParameter *param_act_grad = reinterpret_cast<ActivationGradParameter *>(param);
+    type_ = param_act_grad->type_;
+    alpha_ = param_act_grad->alpha_;
+  }
+  ~ActivationGradCPUKernel() override = default;
+
+  int Init() override;
+  int ReSize() override;
+  int Run() override;
+  int DoActivation(int task_id);
+
+ private:
+  int thread_count_;
+  int type_;
+  float alpha_;
+};
+}  // namespace mindspore::kernel
+
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_ACTIVATION_GRAD_H_
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic_grad.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic_grad.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a9d62c8ee9860c0307de9dbc0cf7b73e5ca7f699
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic_grad.cc
@@ -0,0 +1,285 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "schema/model_generated.h"
+#include "src/kernel_registry.h"
+#include "src/runtime/kernel/arm/opclib/fp32/reduce_grad.h"
+#include "src/runtime/kernel/arm/fp32/arithmetic_grad.h"
+#include "src/runtime/kernel/arm/opclib/fp32/arithmetic_grad.h"
+#include "include/errorcode.h"
+
+using mindspore::kernel::KERNEL_ARCH::kCPU;
+using mindspore::lite::KernelRegistrar;
+using mindspore::lite::RET_ERROR;
+using mindspore::lite::RET_OK;
+
+namespace mindspore::kernel {
+namespace {
+constexpr int kArithGradOpInputNum = 3;
+constexpr int kArithGradOpOutputNum = 2;
+}  // namespace
+
+int ArithmeticGradCPUKernel::Init() {
+  auto ret = InferShape();
+  return ret;
+}
+
+int ArithmeticGradCPUKernel::InferShape() {
+  if (inputs_.size() != kArithGradOpInputNum) {
+    MS_LOG(ERROR) << "The number of input must be " << kArithGradOpInputNum;
+    return RET_ERROR;
+  }
+  if (outputs_.size() != kArithGradOpOutputNum) {
+    MS_LOG(ERROR) << "The number of output must be " << kArithGradOpOutputNum;
+    return RET_ERROR;
+  }
+  auto dy = inputs_[0];
+  auto x1 = inputs_[1];
+  auto x2 = inputs_[2];
+  auto dx1 = outputs_[0];
+  auto dx2 = outputs_[1];
+
+  MS_ASSERT(dy != nullptr);
+  MS_ASSERT(x1 != nullptr);
+  MS_ASSERT(x2 != nullptr);
+  MS_ASSERT(dx1 != nullptr);
+  MS_ASSERT(dx2 != nullptr);
+
+  auto inShape0 = x1->shape();
+  auto inShape1 = x2->shape();
+  auto outShape = dy->shape();
+
+  if ((type() == PrimitiveType_AddGrad) || (type() == PrimitiveType_SubGrad)) {
+    arithmeticParameter_->ndim_ = outShape.size();
+    auto fillDimNum0 = outShape.size() - inShape0.size();
+    auto fillDimNum1 = outShape.size() - inShape1.size();
+    int j0 = 0;
+    int j1 = 0;
+    for (unsigned int i = 0; i < outShape.size(); i++) {
+      arithmeticParameter_->in_shape0_[i] = (i < fillDimNum0) ? 1 : inShape0[j0++];
+      arithmeticParameter_->in_shape1_[i] = (i < fillDimNum1) ? 1 : inShape1[j1++];
+      arithmeticParameter_->out_shape_[i] = outShape[i];
+    }
+  } else {
+    // if (inShape0.size() < inShape1.size())
+    if (dx1->ElementsNum() < dx2->ElementsNum()) {
+      arithmeticParameter_->ndim_ = inShape1.size();
+      if (type() == PrimitiveType_MulGrad)
+        arithmetic_grad_ = &ArithmeticGradCPUKernel::ArithmeticGradMul2L;
+      else if (type() == PrimitiveType_DivGrad)
+        arithmetic_grad_ = &ArithmeticGradCPUKernel::ArithmeticGradDiv2L;
+
+      auto fillDimNum = inShape1.size() - inShape0.size();  // This will not work for batch!
+      int j = 0;
+      for (unsigned int i = 0; i < inShape1.size(); i++) {
+        if (i < fillDimNum) {
+          arithmeticParameter_->in_shape1_[i] = 1;
+        } else {
+          arithmeticParameter_->in_shape1_[i] = inShape0[j++];
+        }
+        arithmeticParameter_->in_shape0_[i] = inShape1[i];
+        arithmeticParameter_->out_shape_[i] = outShape[i];
+      }
+    } else if (dx2->ElementsNum() < dx1->ElementsNum()) {  // if (inShape0.size() > inShape1.size())
+      arithmeticParameter_->ndim_ = inShape0.size();
+      if (type() == PrimitiveType_MulGrad)
+        arithmetic_grad_ = &ArithmeticGradCPUKernel::ArithmeticGradMul1L;
+      else if (type() == PrimitiveType_DivGrad)
+        arithmetic_grad_ = &ArithmeticGradCPUKernel::ArithmeticGradDiv1L;
+      arithmeticParameter_->broadcasting_ = true;
+      arithmeticParameter_->ndim_ = inShape0.size();
+      int j = 0;
+      auto fillDimNum = inShape0.size() - inShape1.size();
+      for (unsigned int i = 0; i < inShape0.size(); i++) {
+        if (i < fillDimNum) {
+          arithmeticParameter_->in_shape1_[i] = 1;
+        } else {
+          arithmeticParameter_->in_shape1_[i] = inShape1[j++];
+        }
+        arithmeticParameter_->in_shape0_[i] = inShape0[i];
+        arithmeticParameter_->out_shape_[i] = outShape[i];
+      }
+    } else {
+      arithmeticParameter_->broadcasting_ = false;
+      for (unsigned int i = 0; i < inShape0.size(); i++) {
+        arithmeticParameter_->in_shape1_[i] = inShape1[i];
+        arithmeticParameter_->in_shape0_[i] = inShape0[i];
+        arithmeticParameter_->out_shape_[i] = outShape[i];
+      }
+    }
+    tile_data0 = new (std::nothrow) float[inputs_.at(0)->ElementsNum()];
+    MS_ASSERT(tile_data0 != nullptr);
+    tile_data1 = new (std::nothrow) float[inputs_.at(0)->ElementsNum()];
+    MS_ASSERT(tile_data1 != nullptr);
+    if (type() == PrimitiveType_DivGrad) {
+      tile_data2 = new (std::nothrow) float[inputs_.at(0)->ElementsNum()];
+      MS_ASSERT(tile_data2 != nullptr);
+    }
+  }
+
+  dx1->set_shape(x1->shape());
+  dx2->set_shape(x2->shape());
+  // outTensor->set_shape(out_shape);
+  dx1->set_data_type(dy->data_type());
+  dx2->set_data_type(dy->data_type());
+  return RET_OK;
+}
+
+void ArithmeticGradCPUKernel::ArithmeticGradAdd(float *dy, int dy_size, float *dx1, int dx1_size, float *dx2,
+                                                int dx2_size) {
+  if (dx1_size == dy_size)
+    memcpy(dx1, dy, dy_size * sizeof(float));
+  else
+    ReduceSumByAxes(dy, arithmeticParameter_->out_shape_, dx1, arithmeticParameter_->in_shape0_,
+                    arithmeticParameter_->ndim_);
+  if (dx2_size == dy_size)
+    memcpy(dx2, dy, dy_size * sizeof(float));
+  else
+    ReduceSumByAxes(dy, arithmeticParameter_->out_shape_, dx2, arithmeticParameter_->in_shape1_,
+                    arithmeticParameter_->ndim_);
+}
+
+void ArithmeticGradCPUKernel::ArithmeticGradSub(float *dy, int dy_size, float *dx1, int dx1_size, float *dx2,
+                                                int dx2_size) {
+  if (dx1_size == dy_size)
+    memcpy(dx1, dy, dy_size * sizeof(float));
+  else
+    ReduceSumByAxes(dy, arithmeticParameter_->out_shape_, dx1, arithmeticParameter_->in_shape0_,
+                    arithmeticParameter_->ndim_);
+  if (dx2_size == dy_size) {
+    for (int i = 0; i < dx2_size; i++) {
+      dx2[i] = -dy[i];
+    }
+  } else {
+    ReduceSumByAxes(dy, arithmeticParameter_->out_shape_, dx2, arithmeticParameter_->in_shape1_,
+                    arithmeticParameter_->ndim_);
+    for (int i = 0; i < dx2_size; i++) {
+      dx2[i] = -dx2[i];
+    }
+  }
+}
+
+void ArithmeticGradCPUKernel::ArithmeticGradMul(float *dy, int dy_size, float *dx1, int dx1_size, float *dx2,
+                                                int dx2_size) {
+  auto x1_data = reinterpret_cast<float *>(inputs_[1]->Data());
+  auto x2_data = reinterpret_cast<float *>(inputs_[2]->Data());
+  ElementMul(dy, x1_data, dx2, dy_size);
+  ElementMul(dy, x2_data, dx1, dy_size);
+}
+
+void ArithmeticGradCPUKernel::ArithmeticGradMul1L(float *dy, int dy_size, float *dx1, int dx1_size, float *dx2,
+                                                  int dx2_size) {
+  auto x1_data = reinterpret_cast<float *>(inputs_[1]->Data());
+  auto x2_data = reinterpret_cast<float *>(inputs_[2]->Data());
+  ElementMul(dy, x1_data, tile_data0, dy_size);
+  ReduceSumByAxes(tile_data0, arithmeticParameter_->in_shape0_, dx2, arithmeticParameter_->in_shape1_,
+                  arithmeticParameter_->ndim_);
+
+  BroadcastMul(dy, x2_data, tile_data0, tile_data1, dx1, dy_size, arithmeticParameter_);  // broadcast directly to dx1
+}
+
+void ArithmeticGradCPUKernel::ArithmeticGradMul2L(float *dy, int dy_size, float *dx1, int dx1_size, float *dx2,
+                                                  int dx2_size) {
+  auto x1_data = reinterpret_cast<float *>(inputs_[1]->Data());
+  auto x2_data = reinterpret_cast<float *>(inputs_[2]->Data());
+  ElementMul(dy, x2_data, tile_data0, dy_size);
+  ReduceSumByAxes(tile_data0, arithmeticParameter_->in_shape0_, dx1, arithmeticParameter_->in_shape1_,
+                  arithmeticParameter_->ndim_);
+
+  BroadcastMul(dy, x1_data, tile_data0, tile_data1, dx2, dy_size, arithmeticParameter_);  // broadcast directly to dx2
+}
+
+void ArithmeticGradCPUKernel::ArithmeticGradDiv(float *dy, int dy_size, float *dx1, int dx1_size, float *dx2,
+                                                int dx2_size) {
+  auto x1 = reinterpret_cast<float *>(inputs_[1]->Data());
+  auto x2 = reinterpret_cast<float *>(inputs_[2]->Data());
+  ElementDiv(dy, x2, dx1, dy_size);
+  ElementMulAndDivNegSquare(dy, x1, x2, dx2, dy_size);
+}
+
+void ArithmeticGradCPUKernel::ArithmeticGradDiv1L(float *dy, int dy_size, float *dx1, int dx1_size, float *dx2,
+                                                  int dx2_size) {
+  auto x1_data = reinterpret_cast<float *>(inputs_[1]->Data());
+  auto x2_data = reinterpret_cast<float *>(inputs_[2]->Data());
+
+  ElementMul(x2_data, x2_data, dx2, dx2_size);
+  ElementMul(x1_data, dy, dx1, dy_size);  // use dx1 buffer
+  BroadcastDiv(dx1, dx2, tile_data0, tile_data1, tile_data2, dy_size,
+               arithmeticParameter_);  // broadcast directly to dx1
+  ReduceSumByAxes(tile_data2, arithmeticParameter_->in_shape0_, dx2, arithmeticParameter_->in_shape1_,
+                  arithmeticParameter_->ndim_);
+  for (int i = 0; i < dx2_size; i++) dx2[i] = -dx2[i];
+  // ReduceNegSumPrefix(tile_data2, dy_size, dx2, dx2_size); //then reduce into dx2
+
+  // broadcasting x2
+  BroadcastDiv(dy, x2_data, tile_data0, tile_data1, dx1, dy_size, arithmeticParameter_);  // broadcast directly to dx1
+}
+
+void ArithmeticGradCPUKernel::ArithmeticGradDiv2L(float *dy, int dy_size, float *dx1, int dx1_size, float *dx2,
+                                                  int dx2_size) {
+  auto x1_data = reinterpret_cast<float *>(inputs_[1]->Data());
+  auto x2_data = reinterpret_cast<float *>(inputs_[2]->Data());
+
+  // dx1 = dy/x2
+  ElementDiv(dy, x2_data, tile_data0, dy_size);  // first multiply into temp
+  ReduceSumByAxes(tile_data0, arithmeticParameter_->in_shape0_, dx1, arithmeticParameter_->in_shape1_,
+                  arithmeticParameter_->ndim_);
+
+  // dx2 = -dy*x1/(x2*x2)
+  BroadcastMul(dy, x1_data, tile_data0, tile_data1, tile_data2, dy_size, arithmeticParameter_);  // broadcast numerator
+  ElementDivNegSquare(tile_data2, x2_data, dx2, dy_size);
+}
+
+int ArithmeticGradCPUKernel::ReSize() { return RET_OK; }
+
+int ArithmeticGradCPUKernel::Run() {
+  auto dy = reinterpret_cast<float *>(inputs_[0]->Data());
+  // auto input1_data1 = reinterpret_cast<float *>(inputs_[1]->Data());
+  auto dx1 = reinterpret_cast<float *>(outputs_[0]->Data());
+  auto dx2 = reinterpret_cast<float *>(outputs_[1]->Data());
+
+  size_t dy_size = inputs_.at(0)->ElementsNum();
+  size_t dx1_size = outputs_.at(0)->ElementsNum();
+  size_t dx2_size = outputs_[1]->ElementsNum();
+  (this->*arithmetic_grad_)(dy, dy_size, dx1, dx1_size, dx2, dx2_size);
+  return RET_OK;
+}
+
+kernel::LiteKernel *CpuArithmeticGradFp32KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
+                                                       const std::vector<lite::tensor::Tensor *> &outputs,
+                                                       OpParameter *opParameter, const lite::Context *ctx,
+                                                       const kernel::KernelKey &desc) {
+  MS_EXCEPTION_IF_NULL(opParameter);
+  if (opParameter == nullptr) {
+    return nullptr;
+  }
+  auto *kernel = new (std::nothrow) ArithmeticGradCPUKernel(opParameter, inputs, outputs);
+  MS_ASSERT(kernel != nullptr);
+  auto ret = kernel->Init();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: "
+                  << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_));
+    delete kernel;
+    return nullptr;
+  }
+  return kernel;
+}
+
+REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_MulGrad, CpuArithmeticGradFp32KernelCreator)
+REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_AddGrad, CpuArithmeticGradFp32KernelCreator)
+REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_SubGrad, CpuArithmeticGradFp32KernelCreator)
+REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_DivGrad, CpuArithmeticGradFp32KernelCreator)
+}  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic_grad.h b/mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic_grad.h
new file mode 100644
index 0000000000000000000000000000000000000000..a11ef24a4488f892de2ab1ce82460289a202a18d
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic_grad.h
@@ -0,0 +1,90 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_ARITHMETIC_GRAD_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_ARITHMETIC_GRAD_H_
+
+#include <vector>
+#include "src/lite_kernel.h"
+#include "src/runtime/kernel/arm/opclib/fp32/arithmetic.h"
+#include "schema/model_generated.h"
+#include "ir/anf.h"
+
+using mindspore::schema::PrimitiveType_AddGrad;
+using mindspore::schema::PrimitiveType_DivGrad;
+using mindspore::schema::PrimitiveType_MulGrad;
+using mindspore::schema::PrimitiveType_SubGrad;
+
+namespace mindspore::kernel {
+
+class ArithmeticGradCPUKernel;
+
+class ArithmeticGradCPUKernel : public LiteKernel {
+  typedef void (ArithmeticGradCPUKernel::*ArithmeticGradOperation)(float *, int, float *, int, float *, int);
+
+ public:
+  explicit ArithmeticGradCPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
+                                   const std::vector<lite::tensor::Tensor *> &outputs)
+      : LiteKernel(parameter, inputs, outputs), tile_data0(NULL), tile_data1(NULL), tile_data2(NULL) {
+    switch (type()) {
+      case PrimitiveType_MulGrad:
+        arithmetic_grad_ = &ArithmeticGradCPUKernel::ArithmeticGradMul;  // this will be adjusted in InferShape
+        break;
+      case PrimitiveType_AddGrad:
+        arithmetic_grad_ = &ArithmeticGradCPUKernel::ArithmeticGradAdd;
+        break;
+      case PrimitiveType_SubGrad:
+        arithmetic_grad_ = &ArithmeticGradCPUKernel::ArithmeticGradSub;
+        break;
+      case PrimitiveType_DivGrad:
+        arithmetic_grad_ = &ArithmeticGradCPUKernel::ArithmeticGradDiv;  // this will be adjusted in InferShape
+        break;
+      default:
+        MS_LOG(ERROR) << "Error Operator type " << parameter->type_;
+        break;
+    }
+    arithmeticParameter_ = reinterpret_cast<ArithmeticParameter *>(parameter);
+  }
+  ~ArithmeticGradCPUKernel() override {
+    if (tile_data0) delete[] tile_data0;
+    if (tile_data1) delete[] tile_data1;
+    if (tile_data2) delete[] tile_data2;
+  }
+  void InitKernel(const CNodePtr &kernel_node);
+
+  int Init() override;
+  int InferShape();
+  int ReSize() override;
+  int Run() override;
+
+ private:
+  void ArithmeticGradAdd(float *dy, int dy_size, float *dx1, int dx1_size, float *dx2, int dx2_size);
+  void ArithmeticGradSub(float *dy, int dy_size, float *dx1, int dx1_size, float *dx2, int dx2_size);
+  void ArithmeticGradMul(float *dy, int dy_size, float *dx1, int dx1_size, float *dx2, int dx2_size);
+  void ArithmeticGradMul1L(float *dy, int dy_size, float *dx1, int dx1_size, float *dx2, int dx2_size);
+  void ArithmeticGradMul2L(float *dy, int dy_size, float *dx1, int dx1_size, float *dx2, int dx2_size);
+  void ArithmeticGradDiv(float *dy, int dy_size, float *dx1, int dx1_size, float *dx2, int dx2_size);
+  void ArithmeticGradDiv1L(float *dy, int dy_size, float *dx1, int dx1_size, float *dx2, int dx2_size);
+  void ArithmeticGradDiv2L(float *dy, int dy_size, float *dx1, int dx1_size, float *dx2, int dx2_size);
+  ArithmeticParameter *arithmeticParameter_;
+  ArithmeticGradOperation arithmetic_grad_;
+  float *tile_data0;
+  float *tile_data1;
+  float *tile_data2;
+};
+}  // namespace mindspore::kernel
+
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_ARITHMETIC_GRAD_H_
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/bias_grad.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/bias_grad.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e57fe298ab9222ba0d99c8e0ce6168f769413348
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/bias_grad.cc
@@ -0,0 +1,115 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vector>
+#include "src/runtime/kernel/arm/fp32/bias_grad.h"
+#include "schema/model_generated.h"
+#include "src/kernel_registry.h"
+#include "include/errorcode.h"
+
+
+using mindspore::kernel::KERNEL_ARCH::kCPU;
+using mindspore::lite::KernelRegistrar;
+using mindspore::schema::PrimitiveType_BiasGrad;
+using mindspore::lite::RET_ERROR;
+using mindspore::lite::RET_OK;
+
+namespace mindspore::kernel {
+int BiasGradCPUKernel::InferShape() {
+  if (1 != this->inputs_.size()) {
+    MS_LOG(ERROR) << "BiasGrad should have one input";
+    return RET_ERROR;
+  }
+  if (1 != this->outputs_.size()) {
+    MS_LOG(ERROR) << "BiasGrad should have one output";
+    return RET_ERROR;
+  }
+  auto *in0 = inputs_.front();
+  auto *out = outputs_.front();
+  MS_ASSERT(in0 != nullptr);
+  MS_ASSERT(out != nullptr);
+  auto inshape = in0->shape();
+  int ndim = inshape.size();
+  for (int i = 0; i < ndim - 1; i++) {
+    inshape[i] = 1;
+  }
+  out->set_shape(inshape);
+  out->set_data_type(in0->data_type());
+  return RET_OK;
+}
+
+int BiasGradCPUKernel::Init() {
+  MS_ASSERT(InferShape() == RET_OK);
+
+  auto dims = inputs_[0]->shape();
+  bias_param->ndim_ = dims.size();
+  for (unsigned int i = 0; i < bias_param->ndim_; i++) {
+    bias_param->in_shape0_[i] = dims[i];
+    bias_param->out_shape_[i] = 1;  // 1 dimension for N,H,W,
+  }
+  bias_param->out_shape_[bias_param->ndim_ - 1] = dims[bias_param->ndim_ - 1];
+  for (int i = bias_param->ndim_; i < 4; i++) {
+    bias_param->in_shape0_[i] = 0;
+    bias_param->out_shape_[i] = 0;
+  }
+  return RET_OK;
+}
+
+
+int BiasGradCPUKernel::ReSize() { return 0; }
+
+int BiasGradCPUKernel::Run() {
+  auto in = reinterpret_cast<float *>(inputs_.at(0)->Data());
+  auto out = reinterpret_cast<float *>(outputs_.at(0)->Data());
+  // size_t data_size = inputs_.at(0)->ElementsNum();
+
+  size_t nhw_size = 1;
+  size_t channels = bias_param->in_shape0_[bias_param->ndim_ - 1];  // C in NHWC
+  for (unsigned int i = 0; i < bias_param->ndim_ - 1; i++) nhw_size *= bias_param->in_shape0_[i];
+
+  size_t total_size = channels * nhw_size;
+  for (size_t c = 0; c < channels; ++c) {
+    out[c] = 0;
+    for (size_t offset = 0; offset < total_size; offset += channels) {
+      out[c] += in[offset + c];
+    }
+  }
+
+  return RET_OK;
+}
+
+
+kernel::LiteKernel *CpuBiasGradFp32KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
+                                                 const std::vector<lite::tensor::Tensor *> &outputs,
+                                                 OpParameter *opParameter, const lite::Context *ctx,
+                                                 const kernel::KernelKey &desc) {
+  MS_ASSERT(opParameter != nullptr);
+  MS_ASSERT(desc.type == schema::PrimitiveType_BiasGrad);
+  auto *kernel = new  (std::nothrow) BiasGradCPUKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs);
+  MS_ASSERT(kernel != nullptr);
+
+  auto ret = kernel->Init();
+  if (RET_OK != ret) {
+    MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: "
+                  << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_));
+    delete kernel;
+    return nullptr;
+  }
+  return kernel;
+}
+
+REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_BiasGrad, CpuBiasGradFp32KernelCreator)
+}  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/bias_grad.h b/mindspore/lite/src/runtime/kernel/arm/fp32/bias_grad.h
new file mode 100644
index 0000000000000000000000000000000000000000..0e7ac186924a805c31d99f59f334b8c5baf2a310
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/bias_grad.h
@@ -0,0 +1,46 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_BIAS_GRAD_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_BIAS_GRAD_H_
+
+#include <vector>
+#include "src/lite_kernel.h"
+#include "ir/anf.h"
+
+#include "src/runtime/kernel/arm/opclib/fp32/arithmetic.h"
+
+namespace mindspore::kernel {
+class BiasGradCPUKernel : public LiteKernel {
+ public:
+  explicit BiasGradCPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
+                             const std::vector<lite::tensor::Tensor *> &outputs)
+      : LiteKernel(parameter, inputs, outputs) {
+    bias_param = reinterpret_cast<ArithmeticParameter *>(parameter);
+  }
+  ~BiasGradCPUKernel() override = default;
+
+  int Init() override;
+  int InferShape();
+  int ReSize() override;
+  int Run() override;
+
+ private:
+  ArithmeticParameter *bias_param;
+};
+}  // namespace mindspore::kernel
+
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_BIAS_GRAD_H_
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/bngrad_input.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/bngrad_input.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a9492b264479770ec5200bde1c7417eb115951ee
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/bngrad_input.cc
@@ -0,0 +1,115 @@
+/**
+ * Copyright 2019 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <algorithm>
+#include <vector>
+#include "schema/model_generated.h"
+#include "src/kernel_factory.h"
+#include "src/runtime/kernel/arm/fp32/bngrad_input.h"
+#include "src/runtime//kernel/arm/opclib/batch_norm.h"
+#include "include/errorcode.h"
+
+using mindspore::kernel::KERNEL_ARCH::kCPU;
+using mindspore::lite::KernelRegistrar;
+using mindspore::lite::RET_ERROR;
+using mindspore::lite::RET_OK;
+// using mindspore::lite::REG_OP;
+using mindspore::schema::PrimitiveType_BNGradInput;
+
+namespace mindspore::kernel {
+int BNGradInputCPUKernel::Init() {
+  auto bn_param = reinterpret_cast<bnParameter *>(opParameter);
+  workspace_size = 5 * bn_param->channels;
+  workspace = new float[workspace_size];
+
+  if (2 != this->inputs_.size()) {
+    MS_LOG(ERROR) << "Conv2d Grad should has 2 inputs";
+    return RET_ERROR;
+  }
+  if (1 != this->outputs_.size()) {
+    MS_LOG(ERROR) << "Conv2d Grad should has one output";
+    return RET_ERROR;
+  }
+  auto *input_tensor = inputs_.at(0);
+  // auto *weight_tensor = inputs_.at(1);
+  auto *out_tensor = outputs_.at(0);
+  auto in_shape = input_tensor->shape();
+  out_tensor->set_shape(in_shape);
+  out_tensor->set_data_type(input_tensor->data_type());
+  return RET_OK;
+}
+
+int BNGradInputCPUKernel::ReSize() { return RET_OK; }
+
+/*
+according to https://wiseodd.github.io/techblog/2016/07/04/batchnorm
+*/
+
+int BNGradInputCPUKernel::Run() {
+  // std::cout << "run succ" << std::endl;
+  auto *input_x = inputs_.at(0);
+  auto *input_yt = inputs_.at(1);
+  auto *input_scale = inputs_.at(2);
+  auto *output_grad = outputs_.at(0);
+  // Tensor *bias = input[5];
+  auto bn_param = reinterpret_cast<bnParameter *>(opParameter);
+  int batch = bn_param->batch;
+  int channels = bn_param->channels;
+  int spatial = bn_param->spatial;
+  float eps = bn_param->eps;
+  std::fill(workspace, workspace + workspace_size, 0.f);
+
+  float *mean = workspace;
+  float *variance = mean + channels;
+  float *mean_delta = variance + channels;
+  float *variance_delta = mean_delta + channels;
+  float *mean_add_delta = variance_delta + channels;
+
+  float *x = reinterpret_cast<float *>(input_x->Data());
+  float *yt = reinterpret_cast<float *>(input_yt->Data());
+  float *scale = reinterpret_cast<float *>(input_scale->Data());
+  float *out = reinterpret_cast<float *>(output_grad->Data());
+
+  std::copy(yt, yt + batch * channels * spatial, out);
+  meanVar(x, batch, spatial, channels, mean, variance);
+  scaleBias(scale, batch, channels, spatial, out);
+  meanDelta(out, spatial, channels, eps, variance, mean_delta);
+  varianceDelta(x, out, mean, variance, batch, channels, spatial, eps, variance_delta);
+  meanAdd(x, mean, variance_delta, batch, channels, spatial, mean_add_delta, mean_delta);
+  NormalizeDelta(x, mean, variance, mean_delta, variance_delta, batch, channels, eps, spatial, out);
+  return RET_OK;
+}
+
+kernel::LiteKernel *CpuBNGradInputFp32KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
+                                                    const std::vector<lite::tensor::Tensor *> &outputs,
+                                                    OpParameter *opParameter, const lite::Context *ctx,
+                                                    const kernel::KernelKey &desc) {
+  MS_ASSERT(opParameter != nullptr);
+  MS_ASSERT(desc.type == schema::PrimitiveType_BNGradInput);
+  //  parameter->name = opDef.name()->str().data();
+  //  parameter->type = opDef.attr_type();
+  auto *kernel = new (std::nothrow) BNGradInputCPUKernel(opParameter, inputs, outputs);
+  MS_ASSERT(kernel != nullptr);
+  auto ret = kernel->Init();
+  if (RET_OK != ret) {
+    MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: "
+                  << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_));
+  }
+  return kernel;
+}
+
+REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_BNGradInput, CpuBNGradInputFp32KernelCreator)
+}  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/bngrad_input.h b/mindspore/lite/src/runtime/kernel/arm/fp32/bngrad_input.h
new file mode 100644
index 0000000000000000000000000000000000000000..e4e6d6e746263ec2b5375ca9ef56288d097c29d0
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/bngrad_input.h
@@ -0,0 +1,41 @@
+/**
+ * Copyright 2019 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_BNGRAD_INPUT_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_BNGRAD_INPUT_H_
+
+#include <vector>
+#include "src/lite_kernel.h"
+#include "ir/anf.h"
+
+namespace mindspore::kernel {
+class BNGradInputCPUKernel : public LiteKernel {
+ public:
+  explicit BNGradInputCPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
+                                const std::vector<lite::tensor::Tensor *> &outputs)
+      : LiteKernel(parameter, inputs, outputs) {}
+  ~BNGradInputCPUKernel() override { delete workspace; }
+
+  int Init() override;
+  int ReSize() override;
+  int Run() override;
+
+ private:
+  float *workspace;
+  int workspace_size;
+};
+}  // namespace mindspore::kernel
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_BNGRAD_INPUT_H_
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_grad_filter.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_grad_filter.cc
new file mode 100644
index 0000000000000000000000000000000000000000..89cba44768b981c9ed4a3b18195e296cc906eaa1
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_grad_filter.cc
@@ -0,0 +1,156 @@
+/**
+ * Copyright 2019 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/runtime/kernel/arm/fp32/convolution_grad_filter.h"
+#include "src/kernel_registry.h"
+#include "src/runtime/kernel/arm/opclib/pack.h"
+#include "src/runtime/kernel/arm/opclib/pack_ext.h"
+#include "src/runtime/kernel/arm/opclib/fp32/gemm.h"
+#include "include/errorcode.h"
+
+using mindspore::kernel::KERNEL_ARCH::kCPU;
+using mindspore::lite::KernelRegistrar;
+using mindspore::lite::RET_ERROR;
+using mindspore::lite::RET_OK;
+using mindspore::schema::PrimitiveType_Conv2DGradFilter;
+
+namespace mindspore::kernel {
+int ConvolutionGradFilterCPUKernel::Init() {
+  // dy is in input 0
+  // x is in input 1
+  // dw is output 0
+
+  if (2 != this->inputs_.size()) {
+    MS_LOG(ERROR) << "Conv2d Grad should has 2 inputs";
+    return RET_ERROR;
+  }
+  if (1 != this->outputs_.size()) {
+    MS_LOG(ERROR) << "Conv2d Grad should has one output";
+    return RET_ERROR;
+  }
+
+  auto *input_tensor = inputs_.at(1);
+  MS_ASSERT(input_tensor != nullptr);
+  auto *dy = inputs_.at(0);
+  MS_ASSERT(dy != nullptr);
+  auto *weight_tensor = outputs_.at(0);
+  MS_ASSERT(weight_tensor != nullptr);
+
+  auto conv_param = reinterpret_cast<ConvParameter *>(opParameter);
+  conv_param->output_batch_ = this->inputs_.at(0)->shape().at(kNHWC_N);
+  conv_param->input_batch_ = this->inputs_.at(1)->shape().at(kNHWC_N);
+  conv_param->input_h_ = this->inputs_.at(1)->shape().at(kNHWC_H);
+  conv_param->input_w_ = this->inputs_.at(1)->shape().at(kNHWC_W);
+  // assume OutCh|kh|kw|In
+  conv_param->input_channel_ = this->inputs_.at(1)->shape().at(kNHWC_C);
+  conv_param->output_channel_ = this->outputs_.at(0)->shape().at(kNHWC_N);
+
+  int ws_size = conv_param->output_h_ * conv_param->output_w_ * conv_param->kernel_h_ * conv_param->kernel_w_ *
+                conv_param->input_channel_ / conv_param->group_;
+
+  workspace = new float[ws_size];
+
+  int output_w = 0;
+  int output_h = 0;
+  output_h = dy->shape()[kNHWC_H];
+  output_w = dy->shape()[kNHWC_W];
+
+  std::vector<int> out_shape(4);
+  out_shape.at(0) = conv_param->output_channel_;
+  out_shape.at(1) = conv_param->kernel_h_;
+  out_shape.at(2) = conv_param->kernel_w_;
+  out_shape.at(3) = conv_param->input_channel_ / conv_param->group_;
+
+  // weight is output
+  weight_tensor->set_shape(out_shape);
+  weight_tensor->set_data_type(input_tensor->data_type());
+
+  conv_param->output_h_ = output_h;
+  conv_param->output_w_ = output_w;
+
+  return RET_OK;
+}
+
+int ConvolutionGradFilterCPUKernel::ReSize() { return 0; }
+
+int ConvolutionGradFilterCPUKernel::Run() {
+  auto conv_param = reinterpret_cast<ConvParameter *>(opParameter);
+  auto *input_dy = inputs_.at(0);
+  auto *input_x = inputs_.at(1);
+  auto *out_dw = outputs_.at(0);
+
+  auto x_addr = reinterpret_cast<float *>(input_x->Data());
+  auto dy_addr = reinterpret_cast<float *>(input_dy->Data());
+  auto dw_addr = reinterpret_cast<float *>(out_dw->Data());
+
+  int i, j;
+  int nweights = out_dw->ElementsNum();
+  int in_ch = conv_param->input_channel_;
+  int in_h = conv_param->input_h_;
+  int in_w = conv_param->input_w_;
+  int k_h = conv_param->kernel_h_;  // out_dw->shape()[1];
+  int k_w = conv_param->kernel_w_;  // out_dw->shape()[2];
+  int batch = conv_param->output_batch_;
+  int out_ch = conv_param->output_channel_;
+  int groups = conv_param->group_;
+  int out_h = conv_param->output_h_;
+  int out_w = conv_param->output_w_;
+
+  int m = out_h * out_w;
+  int n = k_h * k_w * in_ch / groups;
+  int k = out_ch / groups;
+
+  // zero out pointer
+  memset(dw_addr, 0, out_dw->Size());
+
+  for (i = 0; i < batch; ++i) {
+    for (j = 0; j < groups; ++j) {
+      float *mat_a = dy_addr + (i * groups) * m * k + j * (out_ch / groups);
+      float *mat_b = workspace;
+      float *mat_c = dw_addr + j * nweights / groups;
+      float *im = x_addr + (i * groups) * (in_ch / groups) * in_h * in_w + j * (in_ch / groups);
+
+      im2row_hwc(im, mat_b, conv_param);
+      gemm(1, 1, k, n, m, 1, mat_a, out_ch, mat_b, m, 1, mat_c, n);
+    }
+  }
+
+  // std::cout << "run succ" << std::endl;
+  return RET_OK;
+}
+
+kernel::LiteKernel *CpuConvGradFilterFp32KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
+                                                       const std::vector<lite::tensor::Tensor *> &outputs,
+                                                       OpParameter *opParameter, const lite::Context *ctx,
+                                                       const kernel::KernelKey &desc) {
+  MS_ASSERT(opParameter != nullptr);
+  MS_ASSERT(desc.type == schema::PrimitiveType_Conv2DGradFilter);
+
+  auto *kernel = new (std::nothrow) ConvolutionGradFilterCPUKernel(opParameter, inputs, outputs);
+  MS_ASSERT(kernel != nullptr);
+
+  auto ret = kernel->Init();
+  if (RET_OK != ret) {
+    MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: "
+                  << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_));
+    delete kernel;
+    return nullptr;
+  }
+  return kernel;
+}
+
+REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Conv2DGradFilter, CpuConvGradFilterFp32KernelCreator)
+}  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_grad_filter.h b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_grad_filter.h
new file mode 100644
index 0000000000000000000000000000000000000000..c32a798eafada3d3afc65560601b6866db5e386a
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_grad_filter.h
@@ -0,0 +1,41 @@
+/**
+ * Copyright 2019 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_GRAD_FILTER_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_GRAD_FILTER_H_
+
+#include <vector>
+#include "src/lite_kernel.h"
+#include "ir/anf.h"
+
+namespace mindspore::kernel {
+class ConvolutionGradFilterCPUKernel : public LiteKernel {
+ public:
+  explicit ConvolutionGradFilterCPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
+                                          const std::vector<lite::tensor::Tensor *> &outputs)
+      : LiteKernel(parameter, inputs, outputs) {}
+  ~ConvolutionGradFilterCPUKernel() override { delete workspace; }
+
+  int Init() override;
+  int ReSize() override;
+  int Run() override;
+
+ private:
+  float *workspace;
+};
+}  // namespace mindspore::kernel
+
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_GRAD_FILTER_H_
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_grad_input.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_grad_input.cc
new file mode 100644
index 0000000000000000000000000000000000000000..29bb49a8c8f726c3a6ff564d62ba6d224a0ec24b
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_grad_input.cc
@@ -0,0 +1,136 @@
+/**
+ * Copyright 2019 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/runtime/kernel/arm/fp32/convolution_grad_input.h"
+#include "src/kernel_registry.h"
+#include "src/runtime/kernel/arm/opclib/pack.h"
+#include "src/runtime/kernel/arm/opclib/pack_ext.h"
+#include "src/runtime/kernel/arm/opclib/fp32/gemm.h"
+#include "include/errorcode.h"
+
+using mindspore::kernel::KERNEL_ARCH::kCPU;
+using mindspore::lite::KernelRegistrar;
+using mindspore::schema::PrimitiveType_Conv2DGradInput;
+using mindspore::lite::RET_ERROR;
+using mindspore::lite::RET_OK;
+
+namespace mindspore::kernel {
+int ConvolutionGradInputCPUKernel::Init() {
+  if (2 != this->inputs_.size()) {
+    MS_LOG(ERROR) << "Conv2d Grad should has 2 inputs";
+    return RET_ERROR;
+  }
+  if (1 != this->outputs_.size()) {
+    MS_LOG(ERROR) << "Conv2d Grad should has one output";
+    return RET_ERROR;
+  }
+
+  auto *dy_tensor = inputs_.at(kInputIndex);
+  MS_ASSERT(dy_tensor != nullptr);
+  auto *weight_tensor = inputs_.at(kWeightIndex);
+  MS_ASSERT(weight_tensor != nullptr);
+  auto *dx_tensor = outputs_.at(kOutputIndex);
+  MS_ASSERT(dx_tensor != nullptr);
+
+  auto conv_param = reinterpret_cast<ConvParameter *>(opParameter);
+  conv_param->output_batch_ = dx_tensor->shape()[(kNHWC_N)];
+  conv_param->input_batch_ = dy_tensor->shape()[(kNHWC_N)];
+
+  conv_param->input_h_ = dx_tensor->shape()[(kNHWC_H)];
+  conv_param->input_w_ = dx_tensor->shape()[(kNHWC_W)];
+
+  // assume OutCh|kh|kw|In
+  conv_param->input_channel_ = dx_tensor->shape()[(kNHWC_C)];
+  conv_param->output_channel_ = weight_tensor->shape()[(kNHWC_N)];
+
+  // TBD
+  conv_param->output_h_ = dy_tensor->shape()[kNHWC_H];
+  conv_param->output_w_ = dy_tensor->shape()[kNHWC_W];
+
+  int ws_size = conv_param->output_h_ * conv_param->output_w_ * conv_param->kernel_h_ * conv_param->kernel_w_ *
+                conv_param->input_channel_ / conv_param->group_;
+
+  workspace = new float[ws_size];
+  return 0;
+}
+
+int ConvolutionGradInputCPUKernel::ReSize() { return 0; }
+
+int ConvolutionGradInputCPUKernel::Run() {
+  auto conv_param = reinterpret_cast<ConvParameter *>(opParameter);
+  auto *input_dy = inputs_.at(0);
+  auto *input_w = inputs_.at(1);
+  auto *out_dx = outputs_.at(0);
+
+  auto dy_addr = reinterpret_cast<float *>(input_dy->Data());
+  auto w_addr = reinterpret_cast<float *>(input_w->Data());
+  auto dx_addr = reinterpret_cast<float *>(out_dx->Data());
+
+  int i, j;
+  int nweights = input_w->ElementsNum();
+  int in_ch = conv_param->input_channel_;
+  int in_h = conv_param->input_h_;
+  int in_w = conv_param->input_w_;
+  int k_h = conv_param->kernel_h_;  // out_dw->shape()[1];
+  int k_w = conv_param->kernel_w_;  // out_dw->shape()[2];
+  int batch = conv_param->output_batch_;
+  int out_ch = conv_param->output_channel_;
+  int groups = conv_param->group_;
+  int out_h = conv_param->output_h_;
+  int out_w = conv_param->output_w_;
+
+  int m = out_h * out_w;
+  int n = k_w * k_h * in_ch / groups;
+  int k = out_ch / groups;
+
+  memset(dx_addr, 0, sizeof(float) * batch * in_ch * in_h * in_w);
+
+  for (i = 0; i < batch; ++i) {
+    for (j = 0; j < groups; ++j) {
+      float *mat_a = dy_addr + (i * groups) * m * k + j * (out_ch / groups);
+      float *mat_b = w_addr + j * nweights / groups;
+      float *mat_c = workspace;
+      gemm(0, 0, m, n, k, 1, mat_a, out_ch, mat_b, n, 0, mat_c, n);
+      col2im_hwc(mat_c, dx_addr + (i * groups) * (in_ch / groups) * in_h * in_w + j * (in_ch / groups), conv_param);
+    }
+  }
+
+  // std::cout << "run succ" << std::endl;
+  return 0;
+}
+
+kernel::LiteKernel *CpuConvGradInputFp32KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
+                                                      const std::vector<lite::tensor::Tensor *> &outputs,
+                                                      OpParameter *opParameter, const lite::Context *ctx,
+                                                      const kernel::KernelKey &desc) {
+  MS_ASSERT(opParameter != nullptr);
+  MS_ASSERT(desc.type == schema::PrimitiveType_Conv2DGradInput);
+
+  auto *kernel = new (std::nothrow) ConvolutionGradInputCPUKernel(opParameter, inputs, outputs);
+  MS_ASSERT(kernel != nullptr);
+
+  auto ret = kernel->Init();
+  if (0 != ret) {
+    MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: "
+                  << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_));
+    delete kernel;
+    return nullptr;
+  }
+  return kernel;
+}
+
+REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Conv2DGradInput, CpuConvGradInputFp32KernelCreator)
+}  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_grad_input.h b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_grad_input.h
new file mode 100644
index 0000000000000000000000000000000000000000..86901b37ba5b7468105ff1f90e433e59806c5aa6
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_grad_input.h
@@ -0,0 +1,41 @@
+/**
+ * Copyright 2019 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_GRAD_INPUT_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_GRAD_INPUT_H_
+
+#include <vector>
+#include "src/lite_kernel.h"
+#include "ir/anf.h"
+
+namespace mindspore::kernel {
+class ConvolutionGradInputCPUKernel : public LiteKernel {
+ public:
+  explicit ConvolutionGradInputCPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
+                                         const std::vector<lite::tensor::Tensor *> &outputs)
+      : LiteKernel(parameter, inputs, outputs) {}
+  ~ConvolutionGradInputCPUKernel() override { delete workspace; }
+
+  int Init() override;
+  int ReSize() override;
+  int Run() override;
+
+ private:
+  float *workspace;
+};
+}  // namespace mindspore::kernel
+
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_GRAD_INPUT_H_
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/opt_momentum.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/opt_momentum.cc
new file mode 100644
index 0000000000000000000000000000000000000000..84c51509ba0f34630e85a60c5a60e79c45924e0d
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/opt_momentum.cc
@@ -0,0 +1,78 @@
+
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "schema/model_generated.h"
+#include "src/kernel_registry.h"
+#include "src/runtime/kernel/arm/fp32/opt_momentum.h"
+#include "include/errorcode.h"
+
+using mindspore::kernel::KERNEL_ARCH::kCPU;
+using mindspore::lite::KernelRegistrar;
+using mindspore::schema::PrimitiveType_OptMomentum;
+using mindspore::lite::RET_ERROR;
+using mindspore::lite::RET_OK;
+
+namespace mindspore::kernel {
+
+int OptMomentumCPUKernel::ReSize() { return 0; }
+
+int OptMomentumCPUKernel::Run() {
+  if (inputs_.size() != 5 || !outputs_.empty()) {
+    MS_LOG(ERROR) << "OptMomentumCPUKernel error input output size!";
+    return RET_ERROR;
+  }
+
+  if (inputs_[0]->ElementsNum() != inputs_[1]->ElementsNum() ||
+      inputs_[0]->ElementsNum() != inputs_[3]->ElementsNum()) {
+    MS_LOG(ERROR) << "error input data size!";
+    return RET_ERROR;
+  }
+  auto weight = reinterpret_cast<float *>(inputs_[0]->Data());
+  auto accumulate = reinterpret_cast<float *>(inputs_[1]->Data());
+  float learning_rate = reinterpret_cast<float *>(inputs_[2]->Data())[0];
+  auto gradient = reinterpret_cast<float *>(inputs_[3]->Data());
+  float moment = reinterpret_cast<float *>(inputs_[4]->Data())[0];
+  size_t elem_num = inputs_[0]->ElementsNum();
+  for (size_t i = 0; i < elem_num; ++i) {
+    accumulate[i] = accumulate[i] * moment + gradient[i];
+    weight[i] -= accumulate[i] * learning_rate;
+  }
+  return RET_OK;
+}
+
+int OptMomentumCPUKernel::Init() { return 0; }
+
+kernel::LiteKernel *CpuOptMomentumFp32KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
+                                                    const std::vector<lite::tensor::Tensor *> &outputs,
+                                                    OpParameter *opParameter, const lite::Context *ctx,
+                                                    const kernel::KernelKey &desc) {
+  MS_ASSERT(desc.type == schema::PrimitiveType_OptMomentum);
+  auto *kernel = new (std::nothrow) OptMomentumCPUKernel(opParameter, inputs, outputs);
+  MS_ASSERT(kernel != nullptr);
+
+  auto ret = kernel->Init();
+  if (0 != ret) {
+    MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: "
+                  << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_));
+    delete kernel;
+    return nullptr;
+  }
+  return kernel;
+}
+
+REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_OptMomentum, CpuOptMomentumFp32KernelCreator)
+}  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/opt_momentum.h b/mindspore/lite/src/runtime/kernel/arm/fp32/opt_momentum.h
new file mode 100644
index 0000000000000000000000000000000000000000..6746b5bf0b1c4d62c3406e0845ab87648034c293
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/opt_momentum.h
@@ -0,0 +1,40 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_BACKEND_ARM_FP32_OPT_MOMENTUM_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPT_MOMENTUM_H_
+
+#include <vector>
+#include "src/lite_kernel.h"
+#include "ir/anf.h"
+
+namespace mindspore::kernel {
+class OptMomentumCPUKernel : public LiteKernel {
+ public:
+  explicit OptMomentumCPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
+                                   const std::vector<lite::tensor::Tensor *> &outputs)
+      : LiteKernel(parameter, inputs, outputs) {}
+  ~OptMomentumCPUKernel() override {}
+
+  int Init() override;
+  int ReSize() override;
+  int Run() override;
+
+ private:
+};
+}  // namespace mindspore::kernel
+
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPT_MOMENTUM_H_
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/pooling_grad.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/pooling_grad.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9b10cbf369db6f6034cfeb628eb9fb0f4020dce1
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/pooling_grad.cc
@@ -0,0 +1,195 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/runtime/kernel/arm/fp32/pooling_grad.h"
+#include "schema/model_generated.h"
+#include "src/kernel_registry.h"
+#include "src/runtime/kernel/arm/opclib/fp32/pooling.h"
+#include "src/runtime/kernel/arm/opclib/fp32/pooling_grad.h"
+#include "include/errorcode.h"
+
+using mindspore::kernel::KERNEL_ARCH::kCPU;
+using mindspore::lite::KernelRegistrar;
+using mindspore::lite::RET_ERROR;
+using mindspore::lite::RET_OK;
+using mindspore::schema::PrimitiveType_PoolingGrad;
+
+namespace mindspore::kernel {
+#if 0
+int PoolingGradCPUKernel::TfPadding(int input_w, int input_h, int &output_w, int &output_h) {
+  PoolingParameter *pool_param = reinterpret_cast<PoolingParameter *> (opParameter);
+
+  auto stride_w = pool_param->stride_w_;
+  auto stride_h = pool_param->stride_h_;
+  auto window_w = pool_param->window_w_;
+  auto window_h = pool_param->window_h_;
+  auto pad_up = pool_param->pad_u_;
+  auto pad_down = pool_param->pad_d_;
+  auto pad_left = pool_param->pad_l_;
+  auto pad_right = pool_param->pad_r_;
+  if (pool_param->pad_mode_ == PADMODE_SAME) {
+    output_w = ceil(input_w / stride_w);
+    output_h = ceil(input_h / stride_h);
+  } else {
+    output_w = ceil((input_w + pad_left + pad_right - window_w + 1) / stride_w);
+    output_h = ceil((input_h + pad_up + pad_down - window_h + 1) / stride_h);
+  }
+  return RET_OK;
+}
+
+int PoolingGradCPUKernel::CaffePadding(int input_w, int input_h, int &output_w, int &output_h) {
+  PoolingParameter *pool_param = reinterpret_cast<PoolingParameter *> (opParameter);
+
+  auto round_mode = pool_param->round_mode_;
+  auto stride_w = pool_param->stride_w_;
+  auto stride_h = pool_param->stride_h_;
+  auto window_w = pool_param->window_w_;
+  auto window_h = pool_param->window_h_;
+  auto pad_up = pool_param->pad_u_;
+  auto pad_down = pool_param->pad_d_;
+  auto pad_left = pool_param->pad_l_;
+  auto pad_right = pool_param->pad_r_;
+  if (round_mode == ROUNDMODE_FLOOR && false) {
+    output_w = floor((input_w + pad_left + pad_right - window_w) / stride_w + 1);
+    output_h = floor((input_h + pad_up + pad_down - window_h) / stride_h + 1);
+  } else if (round_mode == ROUNDMODE_CEIL || true) {
+    output_w = ceil((input_w + pad_left + pad_right - window_w) / stride_w + 1);
+    output_h = ceil((input_h + pad_up + pad_down - window_h) / stride_h + 1);
+  } else {
+    MS_LOG(ERROR) << "round mode not support.";
+  }
+
+  if (pad_left > 0 || pad_up > 0) {
+    if ((output_w - 1) * stride_w >= input_w + pad_left) {
+      --output_w;
+    }
+    if ((output_h - 1) * stride_h >= input_h + pad_up) {
+      --output_h;
+    }
+  }
+  return RET_OK;
+}
+
+int PoolingGradCPUKernel::OnnxPadding(int input_w, int input_h, int &output_w, int &output_h) {
+  PoolingParameter *pool_param = reinterpret_cast<PoolingParameter *> (opParameter);
+
+  auto round_mode = pool_param->round_mode_;
+  auto stride_w = pool_param->stride_w_;
+  auto stride_h = pool_param->stride_h_;
+  auto window_w = pool_param->window_w_;
+  auto window_h = pool_param->window_h_;
+  auto pad_up = pool_param->pad_u_;
+  auto pad_down = pool_param->pad_d_;
+  auto pad_left = pool_param->pad_l_;
+  auto pad_right = pool_param->pad_r_;
+  if (round_mode == ROUNDMODE_FLOOR) {
+    output_w = floor((input_w + pad_left + pad_right - window_w) / stride_w + 1);
+    output_h = floor((input_h + pad_up + pad_down - window_h) / stride_h + 1);
+  } else if (round_mode == ROUNDMODE_CEIL) {
+    MS_LOG(ERROR) << "RoundMode_CEIL mode not support.";
+  } else {
+    MS_LOG(ERROR) << "OnnxPadding round mode not support.";
+  }
+  return RET_OK;
+}
+#endif
+
+int PoolingGradCPUKernel::Init() {
+  // InferShape():
+  // auto *in_tensor = reinterpret_cast<float *>(inputs_.at(0)->Data());
+  // auto *x_tensor = reinterpret_cast<float *>(inputs_.at(1)->Data());
+
+  PoolingParameter *pool_param = reinterpret_cast<PoolingParameter *>(opParameter);
+
+  auto in_shape = inputs_.at(0)->shape();
+  int input_h = in_shape.at(1);
+  int input_w = in_shape.at(2);
+
+  if (pool_param->global_) {
+    pool_param->window_w_ = input_w;
+    pool_param->window_h_ = input_h;
+  }
+
+  // Emir -- here I assume we get the outputshape in the output tensor
+  auto *out_tensor = outputs_.front();
+  auto out_shape = out_tensor->shape();
+
+#if 0
+  int output_w = 0, output_h = 0;
+  auto fmk_type = pool_param->fmk_type_;
+  switch (fmk_type) {
+    case lite::FmkType_TF:
+      break;
+    case lite::FmkType_CAFFE:
+      CaffePadding(input_w, input_h, output_w, output_h);
+      break;
+    case lite::FmkType_ONNX:
+      OnnxPadding(input_w, input_h, output_w, output_h);
+      break;
+    case lite::FmkType_MS:
+      break;
+    case lite::FmkType_TFLITE:
+      TfPadding(input_w, input_h, output_w, output_h);
+      break;
+    default:
+      MS_LOG(ERROR) << "Not support this framework.";
+  }
+  std::vector<int> out_shape{in_tensor->shape()};
+  out_shape.at(1) = output_h;
+  out_shape.at(2) = output_w;
+#endif
+  out_tensor->set_shape(out_shape);
+  out_tensor->set_data_type(inputs_.at(0)->data_type());
+  return RET_OK;
+}
+
+int PoolingGradCPUKernel::ReSize() { return RET_OK; }
+
+int PoolingGradCPUKernel::Run() {
+  PoolingParameter *pool_param = reinterpret_cast<PoolingParameter *>(opParameter);
+  auto input_ptr = reinterpret_cast<float *>(inputs_.at(0)->Data());
+  auto output_ptr = reinterpret_cast<float *>(outputs_.at(0)->Data());
+
+  if (pool_param->max_pooling_) {
+    auto ind = reinterpret_cast<int *>(inputs_.at(1)->Data());
+    MaxPoolingGrad(input_ptr, ind, output_ptr, pool_param);
+  } else {
+    AvgPoolingGrad(input_ptr, output_ptr, pool_param);
+  }
+  return RET_OK;
+}
+
+kernel::LiteKernel *CpuPoolingGradFp32KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
+                                                    const std::vector<lite::tensor::Tensor *> &outputs,
+                                                    OpParameter *opParameter, const lite::Context *ctx,
+                                                    const kernel::KernelKey &desc) {
+  MS_ASSERT(opParameter != nullptr);
+  MS_ASSERT(desc.type == schema::PrimitiveType_PoolingGrad);
+
+  auto *kernel = new (std::nothrow) PoolingGradCPUKernel(opParameter, inputs, outputs);
+  MS_ASSERT(kernel != nullptr);
+  auto ret = kernel->Init();
+  if (RET_OK != ret) {
+    MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: "
+                  << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_));
+    delete kernel;
+    return nullptr;
+  }
+  return kernel;
+}
+
+REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_PoolingGrad, CpuPoolingGradFp32KernelCreator)
+}  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/pooling_grad.h b/mindspore/lite/src/runtime/kernel/arm/fp32/pooling_grad.h
new file mode 100644
index 0000000000000000000000000000000000000000..eec333d8600fd16a907c07d996cc1fcd3443302f
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/pooling_grad.h
@@ -0,0 +1,50 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_POOLING_GRAD_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_POOLING_GRAD_H_
+
+#include <vector>
+#include "src/lite_kernel.h"
+#include "ir/anf.h"
+
+namespace mindspore::kernel {
+using mindspore::schema::PadMode;
+using mindspore::schema::PoolMode;
+using mindspore::schema::QuantType;
+using mindspore::schema::RoundMode;
+
+class PoolingGradCPUKernel : public LiteKernel {
+ public:
+  explicit PoolingGradCPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
+                                const std::vector<lite::tensor::Tensor *> &outputs)
+      : LiteKernel(parameter, inputs, outputs) {}
+  ~PoolingGradCPUKernel() override = default;
+
+  // int TfPadding(int input_w, int input_h, int &output_w, int &output_h);
+  // int CaffePadding(int input_w, int input_h, int &output_w, int &output_h);
+  // int OnnxPadding(int input_w, int input_h, int &output_w, int &output_h);
+
+  int Init() override;
+  int ReSize() override;
+  int Run() override;
+
+ private:
+  uint8_t data_shape_{0};
+};
+}  // namespace mindspore::kernel
+
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_POOLING_GRAD_H_
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/power_grad.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/power_grad.cc
new file mode 100644
index 0000000000000000000000000000000000000000..759cf8b43753d2667ed05ea5beeee7938790cf15
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/power_grad.cc
@@ -0,0 +1,67 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/runtime/kernel/arm/fp32/power_grad.h"
+#include "schema/model_generated.h"
+#include "src/kernel_registry.h"
+#include "include/errorcode.h"
+#include "src/runtime/kernel/arm/opclib/fp32/arithmetic.h"
+
+using mindspore::lite::KernelRegistrar;
+using mindspore::lite::RET_ERROR;
+using mindspore::lite::RET_OK;
+using mindspore::schema::PrimitiveType_PowerGrad;
+
+namespace mindspore::kernel {
+int PowerGradCPUKernel::Init() { return RET_OK; }
+
+int PowerGradCPUKernel::ReSize() { return RET_OK; }
+
+int PowerGradCPUKernel::Run() {
+  auto dy_addr = reinterpret_cast<float *>(inputs_.at(0)->Data());
+  auto x_addr = reinterpret_cast<float *>(inputs_.at(1)->Data());
+  auto dx_addr = reinterpret_cast<float *>(outputs_.at(0)->Data());
+  auto size = inputs_.at(0)->ElementsNum();
+
+  Power(x_addr, dx_addr, size, power_ - 1, scale_, shift_);
+  ElementMul(dx_addr, dy_addr, dx_addr, size);
+  float scale = scale_ * power_;
+  for (int i = 0; i < size; i++) {
+    dx_addr[i] *= scale;
+  }
+
+  return RET_OK;
+}
+
+kernel::LiteKernel *CpuPowerGradFp32KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
+                                                  const std::vector<lite::tensor::Tensor *> &outputs,
+                                                  OpParameter *opParameter, const lite::Context *ctx,
+                                                  const kernel::KernelKey &desc) {
+  MS_ASSERT(opParameter != nullptr);
+  MS_ASSERT(desc.type == schema::PrimitiveType_PowerGrad);
+  auto *kernel = new (std::nothrow) PowerGradCPUKernel(opParameter, inputs, outputs);
+  auto ret = kernel->Init();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: "
+                  << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_));
+    delete kernel;
+    return nullptr;
+  }
+  return kernel;
+}
+
+REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_PowerGrad, CpuPowerGradFp32KernelCreator)
+}  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/power_grad.h b/mindspore/lite/src/runtime/kernel/arm/fp32/power_grad.h
new file mode 100644
index 0000000000000000000000000000000000000000..316b55e1ebfc74dcfaa0c0035ab9a265f9181359
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/power_grad.h
@@ -0,0 +1,49 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_POWER_GRAD_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_POWER_GRAD_H_
+
+#include <vector>
+#include "src/lite_kernel.h"
+#include "ir/anf.h"
+#include "src/runtime/kernel/arm/opclib/power.h"
+
+namespace mindspore::kernel {
+class PowerGradCPUKernel : public LiteKernel {
+ public:
+  PowerGradCPUKernel(OpParameter *param, const std::vector<lite::tensor::Tensor *> &inputs,
+                          const std::vector<lite::tensor::Tensor *> &outputs)
+      : LiteKernel(param, inputs, outputs) {
+        PowerParameter *power_param = reinterpret_cast<PowerParameter *>(param);
+        power_ = power_param->power_;
+        scale_ = power_param->scale_;
+        shift_ = power_param->shift_;
+  }
+  ~PowerGradCPUKernel() override = default;
+
+  int Init() override;
+  int ReSize() override;
+  int Run() override;
+
+ private:
+  float power_;
+  float scale_;
+  float shift_;
+};
+}  // namespace mindspore::kernel
+
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_POWER_GRAD_H_
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/sparse_softmax_cross_entropy_with_logits.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/sparse_softmax_cross_entropy_with_logits.cc
new file mode 100644
index 0000000000000000000000000000000000000000..005b091fb66c4da760540a8277127ec89d96ef83
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/sparse_softmax_cross_entropy_with_logits.cc
@@ -0,0 +1,145 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/runtime/kernel/arm/fp32/sparse_softmax_cross_entropy_with_logits.h"
+#include "src/runtime/kernel/arm/opclib/fp32/softmax.h"
+#include "schema/model_generated.h"
+#include "src/kernel_registry.h"
+#include "include/errorcode.h"
+
+using mindspore::kernel::KERNEL_ARCH::kCPU;
+using mindspore::lite::KernelRegistrar;
+using mindspore::lite::RET_ERROR;
+using mindspore::lite::RET_OK;
+using mindspore::schema::PrimitiveType_SoftmaxCrossEntropy;
+
+namespace mindspore::kernel {
+
+int SparseSoftmaxCrossEntropyWithLogitsCPUKernel::ReSize() { return RET_OK; }
+
+void SparseSoftmaxCrossEntropyWithLogitsCPUKernel::ForwardPostExecute(const int *labels, const float *losses,
+                                                                      float *output) const {
+  float total_loss = 0;
+  for (int i = 0; i < param->batch_size_; ++i) {
+    if (labels[i] < 0) {
+      MS_LOG(EXCEPTION) << "label value must >= 0";
+    }
+    size_t label = labels[i];
+    if (label > param->number_of_classes_) {
+      MS_LOG(EXCEPTION) << "error label input!";
+    } else {
+      total_loss -= logf(losses[i * param->number_of_classes_ + label]);
+    }
+  }
+  output[0] = total_loss / param->batch_size_;
+}
+
+void SparseSoftmaxCrossEntropyWithLogitsCPUKernel::GradPostExecute(const int *labels, const float *losses,
+                                                                   float *output) const {
+  size_t row_start = 0;
+  for (int i = 0; i < param->batch_size_; ++i) {
+    if (labels[i] < 0) {
+      MS_LOG(EXCEPTION) << "label value must >= 0";
+    }
+    size_t label = labels[i];
+    if (label > param->number_of_classes_) {
+      MS_LOG(EXCEPTION) << "error label input!";
+    }
+    for (size_t j = 0; j < param->number_of_classes_; ++j) {
+      size_t index = row_start + j;
+      if (j == label) {
+        output[index] = (losses[index] - 1) / param->batch_size_;
+      } else {
+        output[index] = losses[index] / param->batch_size_;
+      }
+    }
+    row_start += param->number_of_classes_;
+  }
+}
+
+int SparseSoftmaxCrossEntropyWithLogitsCPUKernel::Run() {
+  auto ins = reinterpret_cast<float *>(inputs_.at(0)->Data());
+  auto labels = reinterpret_cast<int *>(inputs_.at(1)->Data());
+  auto out = reinterpret_cast<float *>(outputs_.at(0)->Data());
+  float *grads = NULL;
+  if (is_train()) {  // outputs_.size() > 1)
+    grads = reinterpret_cast<float *>(outputs_.at(0)->Data());
+  }
+  size_t data_size = inputs_.at(0)->ElementsNum();
+  float *losses = new (std::nothrow) float[data_size];
+  MS_ASSERT(losses != nullptr);
+  std::fill(losses, losses + data_size, 0);
+
+  MS_ASSERT(out != nullptr);
+  MS_ASSERT(labels != nullptr);
+  MS_ASSERT(ins != nullptr);
+
+  SoftmaxParameter sm_params;
+  sm_params.n_dim_ = param->n_dim_;
+  sm_params.element_size_ = data_size;
+  sm_params.axis_ = 1;
+  for (int i = 0; i < 4; i++)  // softmax has only 4 params in shape
+    sm_params.input_shape_[i] = param->input_shape_[i];
+  float sum_data[sm_params.input_shape_[sm_params.axis_]];
+  Softmax(ins, losses, sum_data, &sm_params);
+
+  if (is_train()) {
+    GradPostExecute(labels, losses, grads);
+  } else {
+    ForwardPostExecute(labels, losses, out);
+  }
+  return RET_OK;
+}
+
+int SparseSoftmaxCrossEntropyWithLogitsCPUKernel::Init() {
+  auto dims = inputs_[0]->shape();
+  param->n_dim_ = 2;
+  param->number_of_classes_ = dims[1];
+  param->batch_size_ = dims[0];
+  for (unsigned int i = 0; i < dims.size(); i++) param->input_shape_[i] = dims[i];
+  if (2 != this->inputs_.size()) {
+    MS_LOG(ERROR) << "softmax entropy loss should have two inputs";
+    return RET_ERROR;
+  }
+  auto *in0 = inputs_.front();
+  if (in0 == nullptr) {
+    MS_LOG(ERROR) << "softmax etropy loss in0 have no data";
+    return RET_ERROR;
+  }
+
+  return RET_OK;
+}
+
+kernel::LiteKernel *CpuSoftmaxCrossEntropyFp32KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
+                                                            const std::vector<lite::tensor::Tensor *> &outputs,
+                                                            OpParameter *opParameter, const lite::Context *ctx,
+                                                            const kernel::KernelKey &desc) {
+  MS_ASSERT(opParameter != nullptr);
+  MS_ASSERT(desc.type == schema::PrimitiveType_SoftmaxCrossEntropy);
+  auto *kernel = new (std::nothrow) SparseSoftmaxCrossEntropyWithLogitsCPUKernel(opParameter, inputs, outputs);
+  MS_ASSERT(kernel != nullptr);
+  auto ret = kernel->Init();
+  if (RET_OK != ret) {
+    MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: "
+                  << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_));
+    delete kernel;
+    return nullptr;
+  }
+  return kernel;
+}
+
+REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_SoftmaxCrossEntropy, CpuSoftmaxCrossEntropyFp32KernelCreator)
+}  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/sparse_softmax_cross_entropy_with_logits.h b/mindspore/lite/src/runtime/kernel/arm/fp32/sparse_softmax_cross_entropy_with_logits.h
new file mode 100644
index 0000000000000000000000000000000000000000..ae9a4dd0a9b75ffd8116289ec2b81ab7c7fd657f
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/sparse_softmax_cross_entropy_with_logits.h
@@ -0,0 +1,50 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_SPARSE_SOFTMAX_CROSS_ENTROPY_WITH_LOGITS_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_SPARSE_SOFTMAX_CROSS_ENTROPY_WITH_LOGITS_H_
+
+#include <vector>
+#include "src/lite_kernel.h"
+#include "ir/anf.h"
+#include "src/runtime/kernel/arm/opclib/fp32/softmax_grad.h"
+#include "src/runtime/kernel/arm/opclib/fp32/arithmetic.h"
+
+namespace mindspore::kernel {
+
+class SparseSoftmaxCrossEntropyWithLogitsCPUKernel : public LiteKernel {
+ public:
+  explicit SparseSoftmaxCrossEntropyWithLogitsCPUKernel(OpParameter *parameter,
+                                                        const std::vector<lite::tensor::Tensor *> &inputs,
+                                                        const std::vector<lite::tensor::Tensor *> &outputs)
+      : LiteKernel(parameter, inputs, outputs) {
+    param = reinterpret_cast<SoftmaxCrossEntropyParameter *>(parameter);
+  }
+  ~SparseSoftmaxCrossEntropyWithLogitsCPUKernel() override = default;
+
+  void ForwardPostExecute(const int *labels, const float *losses, float *output) const;
+  void GradPostExecute(const int *labels, const float *losses, float *output) const;
+
+  int Init() override;
+  int ReSize() override;
+  int Run() override;
+
+ private:
+  SoftmaxCrossEntropyParameter *param;
+};
+}  // namespace mindspore::kernel
+
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_SPARSE_SOFTMAX_CROSS_ENTROPY_WITH_LOGITS_H_
diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/activation_grad.h b/mindspore/lite/src/runtime/kernel/arm/opclib/activation_grad.h
new file mode 100644
index 0000000000000000000000000000000000000000..ac49b8567aa941cd2852f8d25a5210d857512805
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/activation_grad.h
@@ -0,0 +1,88 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_ACTIVATION_GRAD_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_ACTIVATION_GRAD_H_
+
+#include <math.h>
+#include "src/runtime/kernel/arm/opclib/op_base.h"
+#include "src/runtime/kernel/arm/opclib/fp32/arithmetic.h"
+#include "src/runtime/kernel/arm/opclib/errorcode.h"
+
+struct ActivationGradParameter {
+  OpParameter op_parameter{};
+  int type_;
+  float alpha_{0.01};
+};
+
+inline int ReluGrad(float *src0, float *src1, int length, float *dst) {
+  for (int i = 0; i < length; ++i) {
+    dst[i] = src1[i] > 0 ? 1.0f : 0.0f;
+  }
+  ElementMul(src0, dst, dst, length);
+  return OPCLIB_OK;
+}
+
+inline int Relu6Grad(float *src0, float *src1, int length, float *dst) {
+  for (int i = 0; i < length; ++i) {
+    if (src1[i] < 0) {
+      dst[i] = 0;
+    } else {
+      dst[i] = src1[i] > 6.0f ? 0.0f : 1.0f;
+    }
+  }
+  ElementMul(src0, dst, dst, length);
+  return OPCLIB_OK;
+}
+
+inline int LReluGrad(float *src0, float *src1, int length, float *dst, float alpha) {
+  for (int i = 0; i < length; ++i) {
+    dst[i] = src1[i] > 0.0f ? 1.0f : alpha;
+  }
+  ElementMul(src0, dst, dst, length);
+  return OPCLIB_OK;
+}
+
+inline int SigmoidGrad(float *src0, float *src1, int length, float *dst) {
+  for (int i = 0; i < length; ++i) {
+    dst[i] = src0[i] * (src1[i] * (1.0f - src1[i]));
+  }
+  return OPCLIB_OK;
+}
+
+inline int TanhGrad(float *src0, float *src1, int length, float *dst) {
+  for (int i = 0; i < length; ++i) {
+    dst[i] = (1.0f - (src1[i] * src1[i])) * src0[i];
+  }
+  return OPCLIB_OK;
+}
+
+inline int HSwishGrad(float *src0, float *src1, int length, float *dst) {
+  for (int i = 0; i < length; ++i) {
+    float tmp = (src1[i] > 3.0f ? 1.0f : (src1[i] < -3.0f ? 0.0f : (2.0f * src1[i] + 3.0f) / 6.0f));
+    dst[i] = tmp * src0[i];
+  }
+  return OPCLIB_OK;
+}
+
+inline int HSigmoidGrad(float *src0, float *src1, int length, float *dst) {
+  for (int i = 0; i < length; ++i) {
+    float tmp = (src1[i] > 3.0f ? 1.0f : (src1[i] < -3.0f ? 0.0f : 1.0f / 6.0f));
+    dst[i] = tmp * src0[i];
+  }
+  return OPCLIB_OK;
+}
+
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_ACTIVATION_GRAD_H_
diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/batch_norm.cc b/mindspore/lite/src/runtime/kernel/arm/opclib/batch_norm.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c11819a1d9e2dd9dd000b38d20cfb2d091496dd6
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/batch_norm.cc
@@ -0,0 +1,120 @@
+/**
+ * Copyright 2019 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <algorithm>
+#include <cmath>
+#include "src/runtime/kernel/arm/opclib/batch_norm.h"
+
+static void sumSpatialBatch(const float *in, int size, int ch, float *out) {
+  std::fill(out, out + ch, 0.f);
+  for (int i = 0; i < size; i++) {
+    const float *ptr = in + i * ch;
+    for (int c = 0; c < ch; c++) {
+      out[c] += ptr[c];
+    }
+  }
+}
+
+void scaleBias(const float *scales, int batch, int n, int size, float *output) {
+  for (int i = 0; i < batch * size; i++)
+    for (int c = 0; c < n; c++) output[i * n + c] *= scales[c];
+}
+
+void normalize(const float *x, const float *mean, const float *variance, float eps, int batch, int filters, int spatial,
+               float *out) {
+  int b, f, i;
+  for (b = 0; b < batch; ++b) {
+    for (i = 0; i < spatial; ++i) {
+      for (f = 0; f < filters; ++f) {
+        int index = b * filters * spatial + i * filters + f;
+        out[index] = (x[index] - mean[f]) / (std::sqrt(variance[f]) + eps);
+      }
+    }
+  }
+}
+
+void backwardScale(const float *x_norm, const float *delta, int batch, int n, int size, float *scale_updates) {
+  int i, b, f;
+  std::fill(scale_updates, scale_updates + n, 0.f);
+  for (b = 0; b < batch; ++b) {
+    for (i = 0; i < size; ++i) {
+      for (f = 0; f < n; ++f) {
+        int index = (b * size + i) * n + f;
+        scale_updates[f] += delta[index] * x_norm[index];
+      }
+    }
+  }
+}
+
+void meanVar(const float *in, int batch, int spatial, int ch, float *mean, float *var) {
+  float N = batch * spatial;
+  sumSpatialBatch(in, N, ch, mean);
+  for (int f = 0; f < ch; ++f) mean[f] /= N;
+  std::fill(var, var + ch, 0.f);
+  for (int i = 0; i < N; i++) {
+    for (int f = 0; f < ch; f++) {
+      float x = in[i * ch + f];
+      var[f] += (x - mean[f]) * (x - mean[f]);
+    }
+  }
+  for (int f = 0; f < ch; f++) var[f] /= N;
+}
+
+void meanDelta(float *yt, int size, int ch, float eps, float *variance, float *mean_delta) {
+  sumSpatialBatch(yt, size, ch, mean_delta);
+  for (int i = 0; i < ch; i++) mean_delta[i] *= -1.f / std::sqrt((variance[i] + eps));
+}
+
+void meanAdd(const float *x, const float *mean, const float *variance_delta, int batch, int filters, int spatial,
+             float *mean_add, float *mean_delta) {
+  int i, k;
+  std::fill(mean_add, mean_add + filters, 0.f);
+  for (k = 0; k < spatial * batch; ++k) {
+    for (i = 0; i < filters; ++i) {
+      int index = k * filters + i;
+      mean_add[i] += x[index] - mean[i];
+    }
+  }
+  for (i = 0; i < filters; ++i) {
+    mean_add[i] *= variance_delta[i] * (-2.f / (spatial * batch));
+    mean_delta[i] += mean_add[i];
+  }
+}
+
+void varianceDelta(const float *x, const float *delta, const float *mean, const float *variance, int batch, int filters,
+                   int spatial, float eps, float *variance_delta) {
+  int i, k;
+  std::fill(variance_delta, variance_delta + filters, 0.f);
+  for (k = 0; k < batch * spatial; k++) {
+    for (i = 0; i < filters; i++) {
+      int index = k * filters + i;
+      variance_delta[i] += delta[index] * (x[index] - mean[i]);
+    }
+  }
+  for (i = 0; i < filters; i++) variance_delta[i] *= -.5 * pow(variance[i] + eps, (-3.f / 2.f));
+}
+
+void NormalizeDelta(const float *x, const float *mean, const float *variance, const float *mean_delta,
+                    const float *variance_delta, int batch, int filters, int spatial, float eps, float *delta) {
+  int f, k;
+  for (k = 0; k < batch * spatial; k++) {
+    for (f = 0; f < filters; f++) {
+      int index = k * filters + f;
+      delta[index] = delta[index] * 1. / (std::sqrt(variance[f] + eps)) +
+                     variance_delta[f] * 2. * (x[index] - mean[f]) / (spatial * batch) +
+                     mean_delta[f] / (spatial * batch);
+    }
+  }
+}
diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/batch_norm.h b/mindspore/lite/src/runtime/kernel/arm/opclib/batch_norm.h
new file mode 100644
index 0000000000000000000000000000000000000000..0d9e8b74bff14059798423167ee847a44f78ee4a
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/batch_norm.h
@@ -0,0 +1,39 @@
+/**
+ * Copyright 2019 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_BACKEND_ARM_BATCH_NORM_H_
+#define MINDSPORE_LITE_SRC_BACKEND_ARM_BATCH_NORM_H_
+
+struct bnParameter {
+  int batch;
+  int channels;
+  int spatial;
+  float eps;
+};
+void scaleBias(const float *scales, int batch, int n, int size, float *output);
+void normalize(const float *x, const float *mean, const float *variance, float eps, int batch, int filters, int spatial,
+               float *out);
+void backwardScale(const float *x_norm, const float *delta, int batch, int n, int size, float *scale_updates);
+void meanVar(const float *in, int batch, int size, int ch, float *mean, float *var);
+void meanDelta(float *yt, int size, int ch, float eps, float *variance, float *mean_delta);
+void varianceDelta(const float *x, const float *delta, const float *mean, const float *variance, int batch, int ch,
+                   int spatial, float eps, float *variance_delta);
+void meanAdd(const float *x, const float *mean, const float *variance_delta, int batch, int filters, int spatial,
+             float *mean_add, float *mean_delta);
+void NormalizeDelta(const float *x, const float *mean, const float *variance, const float *mean_delta,
+                    const float *variance_delta, int batch, int filters, int spatial, float eps, float *delta);
+
+#endif
diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/arithmetic_grad.cc b/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/arithmetic_grad.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4baec49d028a9afef28cbe3cd7deace87fe68e5f
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/arithmetic_grad.cc
@@ -0,0 +1,29 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/runtime/kernel/arm/opclib/fp32/arithmetic_grad.h"
+
+void ElementDivNegSquare(const float *nom, const float *denom, float *output, int element_size) {
+  for (int i = 0; i < element_size; i++) {
+    output[i] = -nom[i] / (denom[i] * denom[i]);
+  }
+}
+
+void ElementMulAndDivNegSquare(const float *a, const float *b, const float *denom, float *output, int element_size) {
+  for (int i = 0; i < element_size; i++) {
+    output[i] = -a[i] * b[i] / (denom[i] * denom[i]);
+  }
+}
diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/arithmetic_grad.h b/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/arithmetic_grad.h
new file mode 100644
index 0000000000000000000000000000000000000000..5ce96e669b40867a5846bde8cad76cc3b4cbafe8
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/arithmetic_grad.h
@@ -0,0 +1,22 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP32_ARITHMETIC_GRAD_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP32_ARITHMETIC_GRAD_H_
+
+void ElementDivNegSquare(const float *nom, const float *denom, float *output, int element_size);
+void ElementMulAndDivNegSquare(const float *a, const float *b, const float *denom, float *output, int element_size);
+
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP32_ARITHMETIC_GRAD_H_
diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/gemm.cc b/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/gemm.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a8f3b2afd4e56b46939d7db672b07d237abb3ee7
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/gemm.cc
@@ -0,0 +1,108 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/runtime/kernel/arm/opclib/fp32/gemm.h"
+
+static void gemm_nn(int M, int N, int K, float alpha, float *mat_a, int lda, float *mat_B, int ldb, float *mat_c,
+                    int ldc) {
+  int i, j, k;
+  for (i = 0; i < M; ++i) {
+    for (k = 0; k < K; ++k) {
+      float a = alpha * mat_a[i * lda + k];
+      for (j = 0; j < N; ++j) {
+        mat_c[i * ldc + j] += a * mat_B[k * ldb + j];
+      }
+    }
+  }
+}
+
+static void gemm_nt(int M, int N, int K, float alpha, float *mat_a, int lda, float *mat_b, int ldb, float *mat_c,
+                    int ldc) {
+  int i, j, k;
+  for (i = 0; i < M; ++i) {
+    for (j = 0; j < N; ++j) {
+      float sum = 0;
+      for (k = 0; k < K; ++k) {
+        sum += alpha * mat_a[i * lda + k] * mat_b[j * ldb + k];
+      }
+      mat_c[i * ldc + j] += sum;
+    }
+  }
+}
+
+static void gemm_tn(int M, int N, int K, float alpha, float *mat_a, int lda, float *mat_b, int ldb, float *mat_c,
+                    int ldc) {
+  int i, j, k;
+  for (i = 0; i < M; ++i) {
+    for (k = 0; k < K; ++k) {
+      float a = alpha * mat_a[k * lda + i];
+      for (j = 0; j < N; ++j) {
+        mat_c[i * ldc + j] += a * mat_b[k * ldb + j];
+      }
+    }
+  }
+}
+
+static void gemm_tt(int M, int N, int K, float alpha, float *mat_a, int lda, float *mat_b, int ldb, float *mat_c,
+                    int ldc) {
+  int i, j, k;
+  for (i = 0; i < M; ++i) {
+    for (j = 0; j < N; ++j) {
+      float sum = 0;
+      for (k = 0; k < K; ++k) {
+        sum += alpha * mat_a[i + k * lda] * mat_b[k + j * ldb];
+      }
+      mat_c[i * ldc + j] += sum;
+    }
+  }
+}
+
+// mat_c = alpha*op( mat_a )*op( mat_b ) + beta*C
+// M - number of rows of matrix a
+// N - number of cols of matrix b
+// K - number of cols of matrix a
+
+void gemm(int transpose_a, int transpose_b, int M, int N, int K, float alpha, float *mat_a, int lda, float *mat_b,
+          int ldb, float beta, float *mat_c, int ldc) {
+  // printf("cpu: %d %d %d %d %d %f %d %d %f %d\n",TA, TB, M, N, K, ALPHA, lda, ldb, BETA, ldc);
+  if (beta >= 0.f && beta <= 0.f) {
+    for (int i = 0; i < M; ++i) {
+      for (int j = 0; j < N; ++j) {
+        mat_c[i * ldc + j] = 0;
+      }
+    }
+  } else if (beta < 1.f || beta > 1.f) {
+    for (int i = 0; i < M; ++i) {
+      for (int j = 0; j < N; ++j) {
+        mat_c[i * ldc + j] *= beta;
+      }
+    }
+  }
+
+  int t;
+
+  for (t = 0; t < M; ++t) {
+    if (!transpose_a && !transpose_b) {
+      gemm_nn(1, N, K, alpha, mat_a + t * lda, lda, mat_b, ldb, mat_c + t * ldc, ldc);
+    } else if (transpose_a && !transpose_b) {
+      gemm_tn(1, N, K, alpha, mat_a + t, lda, mat_b, ldb, mat_c + t * ldc, ldc);
+    } else if (!transpose_a && transpose_b) {
+      gemm_nt(1, N, K, alpha, mat_a + t * lda, lda, mat_b, ldb, mat_c + t * ldc, ldc);
+    } else {
+      gemm_tt(1, N, K, alpha, mat_a + t, lda, mat_b, ldb, mat_c + t * ldc, ldc);
+    }
+  }
+}
diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/gemm.h b/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/gemm.h
new file mode 100644
index 0000000000000000000000000000000000000000..8caf05755f6a0d7f557f9c7522fcce66e99d69f4
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/gemm.h
@@ -0,0 +1,23 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP32_GEMM_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP32_GEMM_H_
+
+void gemm(int transpose_a, int transpose_b, int M, int N, int K, float alpha, float *mat_a, int lda, float *mat_b,
+          int ldb, float beta, float *mat_c, int ldc);
+
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP32_GEMM_H_
diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/pooling_grad.cc b/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/pooling_grad.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2730cbe95d3fb2784f5042ccf79093f8715d1a0c
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/pooling_grad.cc
@@ -0,0 +1,149 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <cstdint>
+#include "src/runtime/kernel/arm/opclib/fp32/pooling_grad.h"
+
+void AvgPoolingGrad(const float *input_ptr, float *output_ptr, PoolingParameter *pooling_param) {
+  int stride_w = pooling_param->stride_w_;
+  int stride_h = pooling_param->stride_h_;
+  int pad_w = pooling_param->pad_l_;
+  int pad_h = pooling_param->pad_u_;
+  int win_w = pooling_param->window_w_;
+  int win_h = pooling_param->window_h_;
+  int channel = pooling_param->input_channel_;
+  int in_w = pooling_param->input_w_;
+  int in_h = pooling_param->input_h_;
+  int output_w = pooling_param->output_w_;
+  int output_h = pooling_param->output_h_;
+  int output_batch = pooling_param->output_batch_;
+
+  const float *inPtr;
+  for (int i = 0; i < output_h * output_w * channel * output_batch; i++) output_ptr[i] = 0.0;
+
+  // int pad_top = padding[2];
+
+  float kk = static_cast<float>(win_h * win_w);
+
+  for (uint16_t ib = 0; ib < output_batch; ib++) {
+    // int in_batch_offset = batch * in_h * in_w * channel;
+    // int out_batch_offset = batch * output_h * output_w * channel;
+    // out = grads->getData(ib*grads->imgSize());
+    // inPtr = in->getData(ib*in->imgSize());
+    float *out;
+    out = &output_ptr[(ib * output_h * output_w)];
+    inPtr = reinterpret_cast<const float *>(&input_ptr[(ib * in_h * in_w)]);
+    if (1) {  // in->layout() == Tensor::nhwc)
+      // iterate over yt
+      for (uint16_t yh = 0; yh < in_h; yh++) {
+        for (uint16_t yw = 0; yw < in_w; yw++) {
+          for (uint16_t ic = 0; ic < channel; ic++) {
+            int idx = (yw + yh * in_w) * channel + ic;  // (ic*in_h*in_w) + (in_w*yh) + yw;
+            float delta = inPtr[idx] / kk;
+            for (int32_t kh = 0; kh < win_h; kh++) {
+              int xh = yh * stride_h + kh - pad_h;
+              if ((xh < 0) || (xh >= output_h)) {
+                continue;
+              }
+              for (int32_t kw = 0; kw < win_w; kw++) {
+                int xw = yw * stride_w + kw - pad_w;
+                if ((xw < 0) || (xw >= output_w)) {
+                  continue;
+                }
+                // out[(ic*output_h*output_w) + (xh*output_w) + xw] += delta;
+                out[(xw + output_w * xh) * channel + ic] += delta;
+              }
+            }
+          }
+        }
+      }
+    } else {  // nchw
+      for (uint16_t ic = 0; ic < channel; ic++) {
+        // iterate over yt
+        for (uint16_t yh = 0; yh < in_h; yh++) {
+          for (uint16_t yw = 0; yw < in_w; yw++) {
+            int idx = (ic * in_h * in_w) + (in_w * yh) + yw;
+            float delta = inPtr[idx] / kk;
+            for (int32_t kh = 0; kh < win_h; kh++) {
+              int xh = yh * stride_h + kh - pad_h;
+              if ((xh < 0) || (xh >= output_h)) {
+                continue;
+              }
+              for (int32_t kw = 0; kw < win_w; kw++) {
+                int xw = yw * stride_w + kw - pad_w;
+                if ((xw < 0) || (xw >= output_w)) {
+                  continue;
+                }
+                out[(ic * output_h * output_w) + (xh * output_w) + xw] += delta;
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+void MaxPoolingGrad(const float *dy, const int *indices, float *output_ptr, PoolingParameter *pooling_param) {
+  // int stride_w = pooling_param->stride_w_;
+  // int stride_h = pooling_param->stride_h_;
+  // int pad_w = pooling_param->pad_l_;
+  // int pad_h = pooling_param->pad_u_;
+  // int win_w = pooling_param->window_w_;
+  // int win_h = pooling_param->window_h_;
+  int channel = pooling_param->input_channel_;
+  int in_w = pooling_param->input_w_;
+  int in_h = pooling_param->input_h_;
+  int output_w = pooling_param->output_w_;
+  int output_h = pooling_param->output_h_;
+  int output_batch = pooling_param->output_batch_;
+
+  int out_img_size =
+    output_h * output_w;  // Emir -- in original code this varible is calculated according to input size ??
+  int ind_img_size = in_h * in_w;
+  // const int w_pad = (output_w + pad_w + pad_w);
+
+  for (int i = 0; i < output_h * output_w * channel * output_batch; i++) output_ptr[i] = 0.0;
+
+  const float *yt = reinterpret_cast<const float *>(dy);
+  const int *pos = reinterpret_cast<const int *>(indices);
+  float *out;
+
+  if (1) {  // grads->layout() == Tensor::nhwc)
+    for (int ib = 0; ib < output_batch; ib++) {
+      out = &(output_ptr[ib * output_w * output_w * channel]);
+      for (int ix = 0; ix < ind_img_size; ix++) {
+        for (int cix = 0; cix < channel; cix++) {
+          int idx = (*pos) * channel + cix;
+          out[idx] += *yt;
+          pos++;
+          yt++;
+        }
+      }
+    }
+  } else {
+    for (int ib = 0; ib < output_batch; ib++) {
+      out = &output_ptr[(ib * out_img_size)];
+      for (int cix = 0; cix < channel; cix++) {
+        for (int ix = 0; ix < ind_img_size; ix++) {
+          int idx = cix * output_h * output_w + *pos;  // cord_y*output_w + cord_x;
+          out[idx] += *yt;
+          pos++;
+          yt++;
+        }
+      }
+    }
+  }
+}
diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/pooling_grad.h b/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/pooling_grad.h
new file mode 100644
index 0000000000000000000000000000000000000000..750530d767c5742a671b1151f1f11c086975faf5
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/pooling_grad.h
@@ -0,0 +1,25 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP32_POOLING_GRAD_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP32_POOLING_GRAD_H_
+
+#include "src/runtime/kernel/arm/opclib/fp32/pooling.h"
+
+void AvgPoolingGrad(const float *input_ptr, float *output_ptr, PoolingParameter *pooling_param);
+void MaxPoolingGrad(const float *dy, const int *indices_ptr, float *output_ptr, PoolingParameter *pooling_param);
+
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP32_POOLING_GRAD_H_
diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/reduce_grad.cc b/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/reduce_grad.cc
new file mode 100644
index 0000000000000000000000000000000000000000..24e7189b3ef8108e64a8ae8f7803c731b67a3844
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/reduce_grad.cc
@@ -0,0 +1,130 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <cstddef>
+#include <algorithm>
+#include "mindspore/lite/src/runtime/kernel/arm/opclib/fp32/reduce_grad.h"
+
+static inline bool NextIndex(const int num_dims, const int *dims, int *current) {
+  int carry = 1;
+  for (int idx = num_dims - 1; idx >= 0; --idx) {
+    int current_val = current[idx] + carry;
+    if (dims[idx] == current_val) {
+      current[idx] = 0;
+    } else {
+      current[idx] = current_val;
+      carry = 0;
+      break;
+    }
+  }
+  return (carry == 0);
+}
+
+static inline size_t GetInputOffset(const int num_dims, const int *dims, const int *iter) {
+  size_t offset = 0;
+  for (int idx = 0; idx < num_dims; ++idx) {
+    offset = offset * (size_t)(dims[idx]) + (size_t)(iter[idx]);
+  }
+
+  return offset;
+}
+
+static inline size_t GetOutputOffset(const int num_dims, const int *dims, const int *iter, const int num_axis,
+                                     const int *axes) {
+  size_t offset = 0;
+  for (int idx = 0; idx < num_dims; ++idx) {
+    // if we need to skip this axis
+    bool is_axis = false;
+    for (int axis_idx = 0; axis_idx < num_axis; ++axis_idx) {
+      if (idx == axes[axis_idx]) {
+        is_axis = true;
+        break;
+      }
+    }
+
+    if (!is_axis) {
+      offset = offset * (size_t)(dims[idx]) + (size_t)(iter[idx]);
+    }
+  }
+  return offset;
+}
+
+void ReduceMeanByAxes(const float *input_data, int *input_iter, const int *input_dims, int input_num_dims,
+                      const int *axes, int num_axes, float *output_data, const int *output_dims, int output_num_dims) {
+  size_t num_outputs = 1;
+  for (int idx = 0; idx < output_num_dims; ++idx) {
+    size_t current = (size_t)(output_dims[idx]);
+    num_outputs *= current;
+  }
+
+  // Reset input iterator.
+  for (int idx = 0; idx < input_num_dims; ++idx) {
+    input_iter[idx] = 0;
+  }
+  // Iterate through input_data.
+  do {
+    size_t input_offset = GetInputOffset(input_num_dims, input_dims, input_iter);
+    size_t output_offset = GetOutputOffset(input_num_dims, input_dims, input_iter, num_axes, axes);
+    output_data[output_offset] += input_data[input_offset];
+  } while (NextIndex(input_num_dims, input_dims, input_iter));
+
+  // Calculate mean by dividing output_data by num of aggregated element.
+  size_t num_elements_in_axis = 1;
+  for (int idx = 0; idx < num_axes; ++idx) {
+    size_t current = (size_t)(input_dims[axes[idx]]);
+    num_elements_in_axis *= current;
+  }
+
+  for (size_t idx = 0; idx < num_outputs; ++idx) {
+    output_data[idx] = output_data[idx] / static_cast<float>(num_elements_in_axis);
+  }
+}
+
+float ReduceMeanAll(const float *src, int size) {
+  float sum = 0;
+  for (int i = 0; i < size; ++i) {
+    sum += src[i];
+  }
+  return sum / size;
+}
+
+void ReduceSumByAxes(const float *input, const int *input_dims, float *output, const int *output_dims, int num_dims) {
+  int num_outputs = 1;
+  int same_shape = true;
+  for (int idx = 0; idx < num_dims; ++idx) {
+    num_outputs *= output_dims[idx];
+    if (output_dims[idx] != input_dims[idx]) same_shape = false;
+  }
+  if (same_shape) {
+    std::copy(input, input + num_outputs * sizeof(float), output);
+    // memcpy(output, input, num_outputs*sizeof(float));
+    return;
+  }
+
+  for (int idx = 0; idx < num_outputs; ++idx) output[idx] = 0;  // zero output
+
+  int input_iter[8] = {0};
+  int axes[5] = {0};
+  int num_axes = 0;
+  for (int i = 0; i < num_dims; i++)
+    if (output_dims[i] == 1) axes[num_axes++] = i;
+
+  // Iterate through input_data.
+  do {
+    size_t input_offset = GetInputOffset(num_dims, input_dims, input_iter);
+    size_t output_offset = GetOutputOffset(num_dims, input_dims, input_iter, num_axes, axes);
+    output[output_offset] += input[input_offset];
+  } while (NextIndex(num_dims, input_dims, input_iter));
+}
diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/reduce_grad.h b/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/reduce_grad.h
new file mode 100644
index 0000000000000000000000000000000000000000..76458171357182f536ac2f6b69daf14a1ae08ecc
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/reduce_grad.h
@@ -0,0 +1,24 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP32_REDUCE_GRAD_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP32_REDUCE_GRAD_H_
+
+float ReduceMeanAll(const float *src, int size);
+void ReduceSumByAxes(const float *input, const int *input_dims, float *output, const int *output_dims, int num_dims);
+
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP32_REDUCE_GRAD_H_
+
diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/softmax_grad.h b/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/softmax_grad.h
new file mode 100644
index 0000000000000000000000000000000000000000..6fb7fc1580b0510e4fb80c7b6827377a382cde0c
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/softmax_grad.h
@@ -0,0 +1,29 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP32_SOFTMAX_GRAD_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP32_SOFTMAX_GRAD_H_
+
+#include "src/runtime/kernel/arm/opclib/op_base.h"
+
+struct SoftmaxCrossEntropyParameter {
+    OpParameter op_parameter;
+    int32_t batch_size_;
+    unsigned int number_of_classes_;
+    int n_dim_;
+    int input_shape_[5];
+};
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP32_SOFTMAX_GRAD_H_
diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/pack_ext.cc b/mindspore/lite/src/runtime/kernel/arm/opclib/pack_ext.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bb6cd1ce6b6de8eabcf434c825a22abb12e5da84
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/pack_ext.cc
@@ -0,0 +1,176 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <string.h>
+#include "src/runtime/kernel/arm/opclib/pack_ext.h"
+
+static int is_a_ge_zero_and_a_lt_b(int a, int b) { return (unsigned)(a) < (unsigned)(b); }
+
+void im2col_hwc(const float *in_data, float *data_col, ConvParameter *conv_param) {
+  const int pad_left = /*conv_param->pad_l_*/ conv_param->pad_w_;
+  // const int pad_right =  /*conv_param->pad_r_*/conv_param->pad_w_;
+  const int pad_up = /*conv_param->pad_u_*/ conv_param->pad_h_;
+  // const int pad_down =   /*conv_param->pad_d/*/conv_param->pad_h_;
+
+  const int stride_h = conv_param->stride_h_;
+  const int stride_w = conv_param->stride_w_;
+
+  const int dilation_h = conv_param->dilation_h_;
+  const int dilation_w = conv_param->dilation_w_;
+
+  const int kernel_h = conv_param->kernel_h_;
+  const int kernel_w = conv_param->kernel_w_;
+
+  const int in_height = conv_param->input_h_;
+  const int in_width = conv_param->input_w_;
+
+  const int output_h = conv_param->output_h_;
+  const int output_w = conv_param->output_w_;
+  const int channels = conv_param->input_channel_ / conv_param->group_;
+  const int tot_channels = conv_param->input_channel_;
+
+  int /*channel,*/ kernel_row, kernel_col, output_rows, output_col;
+
+  int row_stride_offset = 0;
+
+  for (output_rows = output_h; output_rows; output_rows--) {
+    int col_stride_offset = 0;
+    for (output_col = output_w; output_col; output_col--) {
+      for (kernel_row = 0; kernel_row < kernel_h; kernel_row++) {
+        int input_row = -pad_up + kernel_row * dilation_h + row_stride_offset;
+        for (kernel_col = 0; kernel_col < kernel_w; kernel_col++) {
+          int input_col = -pad_left + kernel_col * dilation_w + col_stride_offset;
+
+          if (is_a_ge_zero_and_a_lt_b(input_row, in_height) && is_a_ge_zero_and_a_lt_b(input_col, in_width)) {
+            const int offset = (input_row * in_width + input_col) * tot_channels;
+            memcpy(data_col, in_data + offset, sizeof(float) * channels);
+            data_col += channels;
+          } else {
+            memset(data_col, 0, sizeof(float) * channels);
+            data_col += channels;
+          }
+        }
+      }
+      col_stride_offset += stride_w;
+    }
+    row_stride_offset += stride_h;
+  }
+}
+
+// output matrix is (kernel_h*kernel_w*channels)X(output_h*output_w)
+void im2row_hwc(const float *in_data, float *data_row, ConvParameter *conv_param) {
+  const int pad_left = /*conv_param->pad_l_*/ conv_param->pad_w_;
+  // const int pad_right =  /*conv_param->pad_r_*/conv_param->pad_w_;
+  const int pad_up = /*conv_param->pad_u_*/ conv_param->pad_h_;
+  // const int pad_down =   /*conv_param->pad_d/*/conv_param->pad_h_;
+
+  const int stride_h = conv_param->stride_h_;
+  const int stride_w = conv_param->stride_w_;
+
+  const int dilation_h = conv_param->dilation_h_;
+  const int dilation_w = conv_param->dilation_w_;
+
+  const int kernel_h = conv_param->kernel_h_;
+  const int kernel_w = conv_param->kernel_w_;
+
+  const int in_height = conv_param->input_h_;
+  const int in_width = conv_param->input_w_;
+
+  const int output_h = conv_param->output_h_;
+  const int output_w = conv_param->output_w_;
+  const int channels = conv_param->input_channel_ / conv_param->group_;
+  const int tot_channels = conv_param->input_channel_;
+
+  int channel, kernel_row, kernel_col, output_rows, output_col;
+
+  for (kernel_row = 0; kernel_row < kernel_h; kernel_row++) {
+    for (kernel_col = 0; kernel_col < kernel_w; kernel_col++) {
+      for (channel = 0; channel < channels; channel++) {
+        int input_row = -pad_up + kernel_row * dilation_h;
+        for (output_rows = output_h; output_rows; output_rows--) {
+          if (!is_a_ge_zero_and_a_lt_b(input_row, in_height)) {
+            for (output_col = output_w; output_col; output_col--) {
+              *(data_row++) = 0;
+            }
+          } else {
+            int input_col = -pad_left + kernel_col * dilation_w;
+            for (output_col = output_w; output_col; output_col--) {
+              if (is_a_ge_zero_and_a_lt_b(input_col, in_width)) {
+                const int offset = (input_row * in_width + input_col) * tot_channels + channel;
+                *(data_row++) = in_data[offset];
+              } else {
+                *(data_row++) = 0;
+              }
+              input_col += stride_w;
+            }
+          }
+          input_row += stride_h;
+        }
+      }
+    }
+  }
+}
+
+void col2im_hwc(const float *data_col, float *data_im, ConvParameter *conv_param) {
+  const int pad_left = /*conv_param->pad_l_*/ conv_param->pad_w_;
+  // const int pad_right =  /*conv_param->pad_r_*/conv_param->pad_w_;
+  const int pad_up = /*conv_param->pad_u_*/ conv_param->pad_h_;
+  // const int pad_down =   /*conv_param->pad_d/*/conv_param->pad_h_;
+
+  const int stride_h = conv_param->stride_h_;
+  const int stride_w = conv_param->stride_w_;
+
+  const int dilation_h = conv_param->dilation_h_;
+  const int dilation_w = conv_param->dilation_w_;
+
+  const int kernel_h = conv_param->kernel_h_;
+  const int kernel_w = conv_param->kernel_w_;
+
+  const int in_height = conv_param->input_h_;
+  const int in_width = conv_param->input_w_;
+
+  const int output_h = conv_param->output_h_;
+  const int output_w = conv_param->output_w_;
+  const int channels = conv_param->input_channel_ / conv_param->group_;
+  const int tot_channels = conv_param->input_channel_;
+
+  int kernel_row, kernel_col, output_rows, output_col;
+
+  int row_stride_offset = 0;
+
+  for (output_rows = output_h; output_rows; output_rows--) {
+    int col_stride_offset = 0;
+    for (output_col = output_w; output_col; output_col--) {
+      for (kernel_row = 0; kernel_row < kernel_h; kernel_row++) {
+        int input_row = -pad_up + kernel_row * dilation_h + row_stride_offset;
+        for (kernel_col = 0; kernel_col < kernel_w; kernel_col++) {
+          int input_col = -pad_left + kernel_col * dilation_w + col_stride_offset;
+
+          if (is_a_ge_zero_and_a_lt_b(input_row, in_height) && is_a_ge_zero_and_a_lt_b(input_col, in_width)) {
+            int offset = (input_row * in_width + input_col) * tot_channels;
+            float *data_im_ptr = &data_im[offset];
+            for (int i = 0; i < channels; i++) {
+              data_im_ptr[i] += data_col[i];
+            }
+          }
+          data_col += channels;
+        }
+      }
+      col_stride_offset += stride_w;
+    }
+    row_stride_offset += stride_h;
+  }
+}
diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/pack_ext.h b/mindspore/lite/src/runtime/kernel/arm/opclib/pack_ext.h
new file mode 100644
index 0000000000000000000000000000000000000000..3f51aae13d73cc6b70d4dc8e77e09cca59401e59
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/pack_ext.h
@@ -0,0 +1,26 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_PACK_EXT_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_PACK_EXT_H_
+
+#include "src/runtime/kernel/arm/opclib/conv_parameter.h"
+
+void im2col_hwc(const float *in_data, float *data_col, ConvParameter *conv_param);
+void im2row_hwc(const float *in_data, float *data_row, ConvParameter *conv_param);
+void col2im_hwc(const float *data_col, float *data_im, ConvParameter *conv_param);
+
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_PACK_EXT_H
diff --git a/mindspore/lite/test/CMakeLists.txt b/mindspore/lite/test/CMakeLists.txt
index ad0ef9619546cf7da26498b84203406be118295f..0bfa4b1dbd34c68d5a40701af2aedc89026b8769 100644
--- a/mindspore/lite/test/CMakeLists.txt
+++ b/mindspore/lite/test/CMakeLists.txt
@@ -152,6 +152,7 @@ set(TEST_LITE_SRC
         ${LITE_DIR}/src/scheduler.cc
         ${LITE_DIR}/src/common/graph_util.cc
         ${LITE_DIR}/src/common/file_utils.cc
+        ${LITE_DIR}/src/common/file_utils_ext.cc
         ${LITE_DIR}/src/common/utils.cc
         ${LITE_DIR}/tools/common/graph_util.cc
         ${LITE_DIR}/tools/common/tensor_util.cc
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/activation_grad_fp32_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/activation_grad_fp32_tests.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1badd29a267daafeb2eff45a32f8de418398dcce
--- /dev/null
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/activation_grad_fp32_tests.cc
@@ -0,0 +1,312 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <iostream>
+#include <memory>
+#include <vector>
+
+#include "utils/log_adapter.h"
+#include "common/common_test.h"
+#include "src/common/file_utils.h"
+#include "src/common/file_utils_ext.h"
+#include "mindspore/lite/src/kernel_registry.h"
+#include "mindspore/lite/src/ir/tensor.h"
+#include "mindspore/lite/src/lite_kernel.h"
+#include "mindspore/lite/src/runtime/kernel/arm/fp32/activation_grad.h"
+
+namespace mindspore {
+class TestActGradFp32 :  public mindspore::Common {
+ public:
+  TestActGradFp32() {}
+};
+
+TEST_F(TestActGradFp32, ReluGradFp32) {
+  // runtime part
+  printf("Calculating runtime cost...\n");
+  uint64_t time_avg = 0;
+  size_t output_data_size = 50;
+
+  size_t input_size;
+  std::string input_path = "./test_data/activationGrad/relu_y_50.bin";
+  auto input_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(input_path.c_str(), &input_size));
+  std::string yt_path = "./test_data/activationGrad/relu_yt_50.bin";
+  auto yt_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(yt_path.c_str(), &input_size));
+
+  auto output_data = new float[output_data_size];
+  // warm up loop
+  for (int i = 0; i < 3; i++) {
+    ReluGrad(yt_data, input_data, 50, output_data);
+  }
+
+  int loop_count = 100;
+  auto time_start = mindspore::lite::GetTimeUs();
+  for (int i = 0; i < loop_count; i++) {
+    ReluGrad(yt_data, input_data, 50, output_data);
+  }
+  auto time_end = mindspore::lite::GetTimeUs();
+  auto cost = time_end - time_start;
+  time_avg = cost / loop_count;
+  printf("single thread running time : %f ms\n", time_avg / 1000.0f);
+
+  printf("==================output data=================\n");
+  for (int i = 0; i < 20; i++) {
+    std::cout << output_data[i] << " ,";
+  }
+  std::cout << std::endl;
+
+  std::string output_path = "./test_data/activationGrad/relu_out_50.bin";
+
+  int res = lite::CompareRelativeOutput(output_data, output_path);
+
+  EXPECT_EQ(res, 0);
+
+  delete input_data;
+  delete[] output_data;
+  delete yt_data;
+
+  MS_LOG(INFO) << "ReluGradFp32 passed";
+}
+
+TEST_F(TestActGradFp32, Relu6GradFp32) {
+  // runtime part
+  printf("Calculating runtime cost...\n");
+  uint64_t time_avg = 0;
+  size_t output_data_size = 50;
+
+  size_t input_size;
+  std::string input_path = "./test_data/activationGrad/relu6_y_50.bin";
+  auto input_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(input_path.c_str(), &input_size));
+  std::string yt_path = "./test_data/activationGrad/relu6_yt_50.bin";
+  auto yt_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(yt_path.c_str(), &input_size));
+
+  auto output_data = new float[output_data_size];
+  // warm up loop
+  for (int i = 0; i < 3; i++) {
+    Relu6Grad(yt_data, input_data, 50, output_data);
+  }
+
+  int loop_count = 100;
+  auto time_start = mindspore::lite::GetTimeUs();
+  for (int i = 0; i < loop_count; i++) {
+    Relu6Grad(yt_data, input_data, 50, output_data);
+  }
+  auto time_end = mindspore::lite::GetTimeUs();
+  auto cost = time_end - time_start;
+  time_avg = cost / loop_count;
+  printf("single thread running time : %f ms\n", time_avg / 1000.0f);
+
+  printf("==================output data=================\n");
+  for (int i = 0; i < 20; i++) {
+    std::cout << output_data[i] << " ,";
+  }
+  std::cout << std::endl;
+
+  std::string output_path = "./test_data/activationGrad/relu6_out_50.bin";
+  int res = lite::CompareRelativeOutput(output_data, output_path);
+
+  EXPECT_EQ(res, 0);
+
+  delete input_data;
+  delete[] output_data;
+  delete yt_data;
+
+  MS_LOG(INFO) << "Relu6GradFp32 passed";
+}
+
+TEST_F(TestActGradFp32, LReluGradFp32) {
+  // runtime part
+  printf("Calculating runtime cost...\n");
+  uint64_t time_avg = 0;
+  size_t output_data_size = 50;
+
+  size_t input_size;
+  std::string input_path = "./test_data/activationGrad/lrelu_y_50.bin";
+  auto input_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(input_path.c_str(), &input_size));
+  std::string yt_path = "./test_data/activationGrad/lrelu_yt_50.bin";
+  auto yt_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(yt_path.c_str(), &input_size));
+
+  auto output_data = new float[output_data_size];
+  // warm up loop
+  for (int i = 0; i < 3; i++) {
+    LReluGrad(yt_data, input_data, 50, output_data, 0.1);
+  }
+
+  int loop_count = 100;
+  auto time_start = mindspore::lite::GetTimeUs();
+  for (int i = 0; i < loop_count; i++) {
+    LReluGrad(yt_data, input_data, 50, output_data, 0.1);
+  }
+  auto time_end = mindspore::lite::GetTimeUs();
+  auto cost = time_end - time_start;
+  time_avg = cost / loop_count;
+  printf("single thread running time : %f ms\n", time_avg / 1000.0f);
+
+  printf("==================output data=================\n");
+  for (int i = 0; i < 20; i++) {
+    std::cout << output_data[i] << " ,";
+  }
+  std::cout << std::endl;
+
+  std::string output_path = "./test_data/activationGrad/lrelu_out_50.bin";
+  int res = lite::CompareRelativeOutput(output_data, output_path);
+
+  EXPECT_EQ(res, 0);
+
+  delete input_data;
+  delete[] output_data;
+  delete yt_data;
+
+  MS_LOG(INFO) << "LReluGradFp32 passed";
+}
+
+TEST_F(TestActGradFp32, SigmoidGradFp32) {
+  // runtime part
+  printf("Calculating runtime cost...\n");
+  uint64_t time_avg = 0;
+  size_t output_data_size = 50;
+
+  size_t input_size;
+  std::string input_path = "./test_data/activationGrad/sigmoid_y_50.bin";
+  auto input_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(input_path.c_str(), &input_size));
+  std::string yt_path = "./test_data/activationGrad/sigmoid_yt_50.bin";
+  auto yt_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(yt_path.c_str(), &input_size));
+
+  auto output_data = new float[output_data_size];
+  // warm up loop
+  for (int i = 0; i < 3; i++) {
+    SigmoidGrad(yt_data, input_data, 50, output_data);
+  }
+
+  int loop_count = 100;
+  auto time_start = mindspore::lite::GetTimeUs();
+  for (int i = 0; i < loop_count; i++) {
+    SigmoidGrad(yt_data, input_data, 50, output_data);
+  }
+  auto time_end = mindspore::lite::GetTimeUs();
+  auto cost = time_end - time_start;
+  time_avg = cost / loop_count;
+  printf("single thread running time : %f ms\n", time_avg / 1000.0f);
+
+  printf("==================output data=================\n");
+  for (int i = 0; i < 20; i++) {
+    std::cout << output_data[i] << " ,";
+  }
+  std::cout << std::endl;
+
+  std::string output_path = "./test_data/activationGrad/sigmoid_out_50.bin";
+  int res = lite::CompareRelativeOutput(output_data, output_path);
+
+  EXPECT_EQ(res, 0);
+  // lite::CompareOutput(output_data, output_path);
+
+  delete input_data;
+  delete[] output_data;
+  delete yt_data;
+
+  MS_LOG(INFO) << "SigmoidGradFp32 passed";
+}
+
+TEST_F(TestActGradFp32, tanhGradFp32) {
+  // runtime part
+  printf("Calculating runtime cost...\n");
+  uint64_t time_avg = 0;
+  size_t output_data_size = 50;
+
+  size_t input_size;
+  std::string input_path = "./test_data/activationGrad/tanh_y_50.bin";
+  auto input_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(input_path.c_str(), &input_size));
+  std::string yt_path = "./test_data/activationGrad/tanh_yt_50.bin";
+  auto yt_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(yt_path.c_str(), &input_size));
+
+  auto output_data = new float[output_data_size];
+  // warm up loop
+  for (int i = 0; i < 3; i++) {
+    TanhGrad(yt_data, input_data, 50, output_data);
+  }
+
+  int loop_count = 100;
+  auto time_start = mindspore::lite::GetTimeUs();
+  for (int i = 0; i < loop_count; i++) {
+    TanhGrad(yt_data, input_data, 50, output_data);
+  }
+  auto time_end = mindspore::lite::GetTimeUs();
+  auto cost = time_end - time_start;
+  time_avg = cost / loop_count;
+  printf("single thread running time : %f ms\n", time_avg / 1000.0f);
+
+  printf("==================output data=================\n");
+  for (int i = 0; i < 20; i++) {
+    std::cout << output_data[i] << " ,";
+  }
+  std::cout << std::endl;
+
+  std::string output_path = "./test_data/activationGrad/tanh_out_50.bin";
+  int res = lite::CompareRelativeOutput(output_data, output_path);
+
+  EXPECT_EQ(res, 0);
+
+  delete input_data;
+  delete[] output_data;
+  delete yt_data;
+  MS_LOG(INFO) << "TanhGradFp32 passed";
+}
+
+TEST_F(TestActGradFp32, hswishGradFp32) {
+  // runtime part
+  printf("Calculating runtime cost...\n");
+  uint64_t time_avg = 0;
+  size_t output_data_size = 50;
+
+  size_t input_size;
+  std::string input_path = "./test_data/activationGrad/hswish_x_50.bin";
+  auto input_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(input_path.c_str(), &input_size));
+  std::string yt_path = "./test_data/activationGrad/hswish_yt_50.bin";
+  auto yt_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(yt_path.c_str(), &input_size));
+
+  auto output_data = new float[output_data_size];
+  // warm up loop
+  for (int i = 0; i < 3; i++) {
+    HSwishGrad(yt_data, input_data, 50, output_data);
+  }
+
+  int loop_count = 100;
+  auto time_start = mindspore::lite::GetTimeUs();
+  for (int i = 0; i < loop_count; i++) {
+    HSwishGrad(yt_data, input_data, 50, output_data);
+  }
+  auto time_end = mindspore::lite::GetTimeUs();
+  auto cost = time_end - time_start;
+  time_avg = cost / loop_count;
+  printf("single thread running time : %f ms\n", time_avg / 1000.0f);
+
+  printf("==================output data=================\n");
+  for (int i = 0; i < 20; i++) {
+    std::cout << output_data[i] << " ,";
+  }
+  std::cout << std::endl;
+
+  std::string output_path = "./test_data/activationGrad/hswish_out_50.bin";
+  int res = lite::CompareRelativeOutput(output_data, output_path);
+
+  EXPECT_EQ(res, 0);
+
+  delete input_data;
+  delete[] output_data;
+  delete yt_data;
+  MS_LOG(INFO) << "hswishGradFp32 passed";
+}
+
+}  // namespace mindspore
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/arithmetic_grad_fp32_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/arithmetic_grad_fp32_tests.cc
new file mode 100644
index 0000000000000000000000000000000000000000..48d5c93e563447c34edb766e24082ec592888e36
--- /dev/null
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/arithmetic_grad_fp32_tests.cc
@@ -0,0 +1,497 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <iostream>
+#include <memory>
+#include <vector>
+#include "utils/log_adapter.h"
+#include "common/common_test.h"
+#include "src/common/file_utils.h"
+#include "src/common/file_utils_ext.h"
+#include "mindspore/lite/src/runtime/kernel/arm/opclib/fp32/reduce.h"
+#include "mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic_grad.h"
+#include "mindspore/lite/src/kernel_registry.h"
+
+namespace mindspore {
+
+class TestArithmeticGradFp32 : public mindspore::Common {
+ public:
+  TestArithmeticGradFp32() {}
+};
+
+std::vector<lite::tensor::Tensor *> GenerateTensorsForTest(const char *test, int test_id) {
+  size_t input_size;
+  std::vector<int> large_dim({4, 6});
+  std::vector<int> small_dim({6});
+  int large_size = (4 * 6);
+  int small_size = (1 * 6);
+  char *dx1_file = const_cast<char *>("./test_data/operators/arithmetic_fp32_1_x1_4_6.bin");
+  char *dx2_file = const_cast<char *>("./test_data/operators/arithmetic_fp32_1_x2_1_6.bin");
+
+  if (test_id == 7) {
+    large_dim = std::vector<int>({4, 5, 6});
+    small_dim = std::vector<int>({6});
+    large_size = (4 * 5 * 6);
+    small_size = (6);
+    dx1_file = const_cast<char *>("./test_data/operators/arithmetic_fp32_7_x1_4_5_6.bin");
+    dx2_file = const_cast<char *>("./test_data/operators/arithmetic_fp32_7_x2_1_1_6.bin");
+  }
+  if (test_id >= 8) {
+    large_dim = std::vector<int>({5, 4, 6});
+    small_dim = std::vector<int>({5, 1, 6});
+    large_size = (4 * 5 * 6);
+    small_size = (5 * 6);
+    dx1_file = const_cast<char *>("./test_data/operators/arithmetic_fp32_8_x1_5_4_6.bin");
+    dx2_file = const_cast<char *>("./test_data/operators/arithmetic_fp32_8_x2_5_1_6.bin");
+  }
+
+  auto dy_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(test, &input_size));
+  lite::tensor::Tensor *dy_tensor = new lite::tensor::Tensor(TypeId::kNumberTypeFloat32, large_dim);
+  dy_tensor->SetData(dy_data);
+
+  auto x1_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(dx1_file, &input_size));
+  lite::tensor::Tensor *x1_tensor = new lite::tensor::Tensor(TypeId::kNumberTypeFloat32, large_dim);
+  x1_tensor->SetData(x1_data);
+
+  auto x2_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(dx2_file, &input_size));
+  lite::tensor::Tensor *x2_tensor = new lite::tensor::Tensor(TypeId::kNumberTypeFloat32, small_dim);
+  x2_tensor->SetData(x2_data);
+
+  auto dx1_data = new float[large_size];
+  lite::tensor::Tensor *dx1_tensor = new lite::tensor::Tensor(TypeId::kNumberTypeFloat32, large_dim);
+  dx1_tensor->SetData(dx1_data);
+
+  auto dx2_data = new float[small_size];
+  lite::tensor::Tensor *dx2_tensor = new lite::tensor::Tensor(TypeId::kNumberTypeFloat32, small_dim);
+  dx2_tensor->SetData(dx2_data);
+
+  std::vector<lite::tensor::Tensor *> ret_vector = {dy_tensor, x1_tensor, x2_tensor, dx1_tensor, dx2_tensor};
+  return ret_vector;
+}
+
+TEST_F(TestArithmeticGradFp32, TestAddGradFp32) {
+  auto param = new ArithmeticParameter();
+  param->op_parameter_.type_ = PrimitiveType_AddGrad;
+  std::vector<lite::tensor::Tensor *> all_tensors =
+    GenerateTensorsForTest("./test_data/operators/arithmetic_fp32_1_dy_4_6.bin", 1);
+
+  std::vector<lite::tensor::Tensor *> inputs = {all_tensors[0], all_tensors[1], all_tensors[2]};
+  std::vector<lite::tensor::Tensor *> outputs = {all_tensors[3], all_tensors[4]};
+  kernel::KernelKey desc = {kernel::kCPU, TypeId::kNumberTypeFloat32, schema::PrimitiveType_AddGrad};
+  auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc);
+  auto kernel_obj = creator(inputs, outputs, reinterpret_cast<OpParameter *>(param), NULL, desc);
+  kernel_obj->Run();
+
+  float *output_ptr = reinterpret_cast<float *>(outputs[1]->Data());
+  printf("==================output data=================\n");
+  for (int i = 0; i < 6; i++) {
+    std::cout << output_ptr[i] << " ,";
+  }
+  std::cout << std::endl;
+
+  std::string output_path = "./test_data/operators/arithmetic_fp32_1_dx1_4_6.bin";
+  EXPECT_EQ(0, lite::CompareRelativeOutput(reinterpret_cast<float *>(outputs[0]->Data()), output_path));
+
+  std::string dx2_path = "./test_data/operators/arithmetic_fp32_1_dx2_1_6.bin";
+  EXPECT_EQ(0, lite::CompareRelativeOutput(output_ptr, dx2_path));
+
+  for (int i = 0; i < 5; i++) delete all_tensors[i];
+  delete param;
+  MS_LOG(INFO) << "TestAddGradFp32 passed";
+}
+
+TEST_F(TestArithmeticGradFp32, TestAddGrad2Fp32) {
+  auto param = new ArithmeticParameter();
+  param->op_parameter_.type_ = PrimitiveType_AddGrad;
+  std::vector<lite::tensor::Tensor *> all_tensors =
+    GenerateTensorsForTest("./test_data/operators/arithmetic_fp32_1_dy_4_6.bin", 1);
+
+  std::vector<lite::tensor::Tensor *> inputs = {all_tensors[0], all_tensors[2], all_tensors[1]};
+  std::vector<lite::tensor::Tensor *> outputs = {all_tensors[4], all_tensors[3]};
+  kernel::KernelKey desc = {kernel::kCPU, TypeId::kNumberTypeFloat32, schema::PrimitiveType_AddGrad};
+  auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc);
+  auto kernel_obj = creator(inputs, outputs, reinterpret_cast<OpParameter *>(param), NULL, desc);
+  kernel_obj->Run();
+
+  float *output_ptr = reinterpret_cast<float *>(outputs[0]->Data());
+  printf("==================output data=================\n");
+  for (int i = 0; i < 6; i++) {
+    std::cout << output_ptr[i] << " ,";
+  }
+  std::cout << std::endl;
+
+  std::string output_path = "./test_data/operators/arithmetic_fp32_1_dx1_4_6.bin";
+  EXPECT_EQ(0, lite::CompareRelativeOutput(reinterpret_cast<float *>(outputs[1]->Data()), output_path));
+
+  std::string dx2_path = "./test_data/operators/arithmetic_fp32_1_dx2_1_6.bin";
+  EXPECT_EQ(0, lite::CompareRelativeOutput(output_ptr, dx2_path));
+
+  for (int i = 0; i < 5; i++) delete all_tensors[i];
+  delete param;
+  MS_LOG(INFO) << "TestAddGrad2Fp32 passed";
+}
+
+TEST_F(TestArithmeticGradFp32, TestAddGrad3Fp32) {
+  auto param = new ArithmeticParameter();
+  param->op_parameter_.type_ = PrimitiveType_AddGrad;
+  std::vector<lite::tensor::Tensor *> all_tensors =
+    GenerateTensorsForTest("./test_data/operators/arithmetic_fp32_8_dy_5_4_6.bin", 8);
+
+  std::vector<lite::tensor::Tensor *> inputs = {all_tensors[0], all_tensors[1], all_tensors[2]};
+  std::vector<lite::tensor::Tensor *> outputs = {all_tensors[3], all_tensors[4]};
+  kernel::KernelKey desc = {kernel::kCPU, TypeId::kNumberTypeFloat32, schema::PrimitiveType_AddGrad};
+  auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc);
+  auto kernel_obj = creator(inputs, outputs, reinterpret_cast<OpParameter *>(param), NULL, desc);
+  kernel_obj->Run();
+
+  float *output_ptr = reinterpret_cast<float *>(outputs[0]->Data());
+  printf("==================output data=================\n");
+  for (int i = 0; i < 6; i++) {
+    std::cout << output_ptr[i] << " ,";
+  }
+  std::cout << std::endl;
+
+  std::string output_path = "./test_data/operators/arithmetic_fp32_8_dx2_5_1_6.bin";
+  EXPECT_EQ(0, lite::CompareRelativeOutput(reinterpret_cast<float *>(outputs[1]->Data()), output_path));
+
+  std::string dx2_path = "./test_data/operators/arithmetic_fp32_8_dx1_5_4_6.bin";
+  EXPECT_EQ(0, lite::CompareRelativeOutput(output_ptr, dx2_path));
+
+  for (int i = 0; i < 5; i++) delete all_tensors[i];
+  delete param;
+  MS_LOG(INFO) << "TestAddGrad3Fp32 passed";
+}
+
+TEST_F(TestArithmeticGradFp32, TestSubGradFp32) {
+  auto param = new ArithmeticParameter();
+  param->op_parameter_.type_ = PrimitiveType_SubGrad;
+  std::vector<lite::tensor::Tensor *> all_tensors =
+    GenerateTensorsForTest("./test_data/operators/arithmetic_fp32_2_dy_4_6.bin", 2);
+
+  std::vector<lite::tensor::Tensor *> inputs = {all_tensors[0], all_tensors[1], all_tensors[2]};
+  std::vector<lite::tensor::Tensor *> outputs = {all_tensors[3], all_tensors[4]};
+  kernel::KernelKey desc = {kernel::kCPU, TypeId::kNumberTypeFloat32, schema::PrimitiveType_SubGrad};
+  auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc);
+  auto kernel_obj = creator(inputs, outputs, reinterpret_cast<OpParameter *>(param), NULL, desc);
+  kernel_obj->Run();
+
+  float *output_ptr = reinterpret_cast<float *>(outputs[1]->Data());
+  printf("==================output data=================\n");
+  for (int i = 0; i < 6; i++) {
+    std::cout << output_ptr[i] << " ,";
+  }
+  std::cout << std::endl;
+
+  std::string output_path = "./test_data/operators/arithmetic_fp32_2_dx1_4_6.bin";
+  EXPECT_EQ(0, lite::CompareRelativeOutput(reinterpret_cast<float *>(outputs[0]->Data()), output_path));
+
+  std::string dx2_path = "./test_data/operators/arithmetic_fp32_2_dx2_1_6.bin";
+  EXPECT_EQ(0, lite::CompareRelativeOutput(output_ptr, dx2_path));
+
+  for (int i = 0; i < 5; i++) delete all_tensors[i];
+  delete param;
+  MS_LOG(INFO) << "TestSubGradFp32 passed";
+}
+
+TEST_F(TestArithmeticGradFp32, TestSubGrad2Fp32) {
+  auto param = new ArithmeticParameter();
+  param->op_parameter_.type_ = PrimitiveType_SubGrad;
+  std::vector<lite::tensor::Tensor *> all_tensors =
+    GenerateTensorsForTest("./test_data/operators/arithmetic_fp32_3_dy_4_6.bin", 3);
+
+  std::vector<lite::tensor::Tensor *> inputs = {all_tensors[0], all_tensors[2], all_tensors[1]};
+  std::vector<lite::tensor::Tensor *> outputs = {all_tensors[4], all_tensors[3]};
+  kernel::KernelKey desc = {kernel::kCPU, TypeId::kNumberTypeFloat32, schema::PrimitiveType_SubGrad};
+  auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc);
+  auto kernel_obj = creator(inputs, outputs, reinterpret_cast<OpParameter *>(param), NULL, desc);
+  kernel_obj->Run();
+
+  float *output_ptr = reinterpret_cast<float *>(outputs[0]->Data());
+  printf("==================output data=================\n");
+  for (int i = 0; i < 6; i++) {
+    std::cout << output_ptr[i] << " ,";
+  }
+  std::cout << std::endl;
+
+  std::string output_path = "./test_data/operators/arithmetic_fp32_3_dx1_4_6.bin";
+  EXPECT_EQ(0, lite::CompareRelativeOutput(reinterpret_cast<float *>(outputs[1]->Data()), output_path));
+
+  std::string dx2_path = "./test_data/operators/arithmetic_fp32_3_dx2_1_6.bin";
+  EXPECT_EQ(0, lite::CompareRelativeOutput(output_ptr, dx2_path));
+
+  for (int i = 0; i < 5; i++) delete all_tensors[i];
+  delete param;
+  MS_LOG(INFO) << "TestSubGrad2Fp32 passed";
+}
+
+TEST_F(TestArithmeticGradFp32, TestMulGradFp32) {
+  auto param = new ArithmeticParameter();
+  param->op_parameter_.type_ = PrimitiveType_MulGrad;
+  std::vector<lite::tensor::Tensor *> all_tensors =
+    GenerateTensorsForTest("./test_data/operators/arithmetic_fp32_4_dy_4_6.bin", 4);
+
+  std::vector<lite::tensor::Tensor *> inputs = {all_tensors[0], all_tensors[1], all_tensors[2]};
+  std::vector<lite::tensor::Tensor *> outputs = {all_tensors[3], all_tensors[4]};
+  kernel::KernelKey desc = {kernel::kCPU, TypeId::kNumberTypeFloat32, schema::PrimitiveType_MulGrad};
+  auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc);
+  auto kernel_obj = creator(inputs, outputs, reinterpret_cast<OpParameter *>(param), NULL, desc);
+
+  int loop_count = 1000;
+  auto time_start = mindspore::lite::GetTimeUs();
+  for (int i = 0; i < loop_count; i++) {
+    kernel_obj->Run();
+  }
+  auto time_end = mindspore::lite::GetTimeUs();
+  auto cost = time_end - time_start;
+  printf("total cost (for %d loops): %lu us\n", loop_count, cost);
+  // auto time_avg = cost / loop_count;
+  // printf("single thread running time : %f ms\n", time_avg / 1000.0f);
+
+  float *output_ptr = reinterpret_cast<float *>(outputs[1]->Data());
+  printf("==================output data=================\n");
+  for (int i = 0; i < 6; i++) {
+    std::cout << output_ptr[i] << " ,";
+  }
+  std::cout << std::endl;
+
+  std::string output_path = "./test_data/operators/arithmetic_fp32_4_dx1_4_6.bin";
+  EXPECT_EQ(0, lite::CompareRelativeOutput(reinterpret_cast<float *>(outputs[0]->Data()), output_path));
+
+  std::string dx2_path = "./test_data/operators/arithmetic_fp32_4_dx2_1_6.bin";
+  EXPECT_EQ(0, lite::CompareRelativeOutput(output_ptr, dx2_path));
+
+  for (int i = 0; i < 5; i++) delete all_tensors[i];
+  delete param;
+  MS_LOG(INFO) << "TestMulGradFp32 passed";
+}
+
+TEST_F(TestArithmeticGradFp32, TestMulGrad2Fp32) {
+  auto param = new ArithmeticParameter();
+  param->op_parameter_.type_ = PrimitiveType_MulGrad;
+  std::vector<lite::tensor::Tensor *> all_tensors =
+    GenerateTensorsForTest("./test_data/operators/arithmetic_fp32_4_dy_4_6.bin", 4);
+
+  std::vector<lite::tensor::Tensor *> inputs = {all_tensors[0], all_tensors[2], all_tensors[1]};
+  std::vector<lite::tensor::Tensor *> outputs = {all_tensors[4], all_tensors[3]};
+  kernel::KernelKey desc = {kernel::kCPU, TypeId::kNumberTypeFloat32, schema::PrimitiveType_MulGrad};
+  auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc);
+  auto kernel_obj = creator(inputs, outputs, reinterpret_cast<OpParameter *>(param), NULL, desc);
+  kernel_obj->Run();
+
+  float *output_ptr = reinterpret_cast<float *>(outputs[0]->Data());
+  printf("==================output data=================\n");
+  for (int i = 0; i < 6; i++) {
+    std::cout << output_ptr[i] << " ,";
+  }
+  std::cout << std::endl;
+
+  std::string output_path = "./test_data/operators/arithmetic_fp32_4_dx1_4_6.bin";
+  EXPECT_EQ(0, lite::CompareRelativeOutput(reinterpret_cast<float *>(outputs[1]->Data()), output_path));
+
+  std::string dx2_path = "./test_data/operators/arithmetic_fp32_4_dx2_1_6.bin";
+  EXPECT_EQ(0, lite::CompareRelativeOutput(output_ptr, dx2_path));
+
+  for (int i = 0; i < 5; i++) delete all_tensors[i];
+  delete param;
+  MS_LOG(INFO) << "TestMulGrad2Fp32 passed";
+}
+
+TEST_F(TestArithmeticGradFp32, TestMulGrad3Fp32) {
+  auto param = new ArithmeticParameter();
+  param->op_parameter_.type_ = PrimitiveType_MulGrad;
+  std::vector<lite::tensor::Tensor *> all_tensors =
+    GenerateTensorsForTest("./test_data/operators/arithmetic_fp32_9_dy_5_4_6.bin", 9);
+
+  std::vector<lite::tensor::Tensor *> inputs = {all_tensors[0], all_tensors[1], all_tensors[2]};
+  std::vector<lite::tensor::Tensor *> outputs = {all_tensors[3], all_tensors[4]};
+  kernel::KernelKey desc = {kernel::kCPU, TypeId::kNumberTypeFloat32, schema::PrimitiveType_MulGrad};
+  auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc);
+  auto kernel_obj = creator(inputs, outputs, reinterpret_cast<OpParameter *>(param), NULL, desc);
+  kernel_obj->Run();
+
+  float *output_ptr = reinterpret_cast<float *>(outputs[1]->Data());
+  printf("==================output data=================\n");
+  for (int i = 0; i < 6; i++) {
+    std::cout << output_ptr[i] << " ,";
+  }
+  std::cout << std::endl;
+
+  std::string output_path = "./test_data/operators/arithmetic_fp32_9_dx1_5_4_6.bin";
+  EXPECT_EQ(0, lite::CompareRelativeOutput(reinterpret_cast<float *>(outputs[0]->Data()), output_path));
+
+  std::string dx2_path = "./test_data/operators/arithmetic_fp32_9_dx2_5_1_6.bin";
+  EXPECT_EQ(0, lite::CompareRelativeOutput(output_ptr, dx2_path));
+
+  for (int i = 0; i < 5; i++) delete all_tensors[i];
+  delete param;
+  MS_LOG(INFO) << "TestMulGrad3Fp32 passed";
+}
+
+TEST_F(TestArithmeticGradFp32, TestMulGrad4Fp32) {
+  auto param = new ArithmeticParameter();
+  param->op_parameter_.type_ = PrimitiveType_MulGrad;
+  std::vector<lite::tensor::Tensor *> all_tensors =
+    GenerateTensorsForTest("./test_data/operators/arithmetic_fp32_9_dy_5_4_6.bin", 9);
+
+  std::vector<lite::tensor::Tensor *> inputs = {all_tensors[0], all_tensors[2], all_tensors[1]};
+  std::vector<lite::tensor::Tensor *> outputs = {all_tensors[4], all_tensors[3]};
+  kernel::KernelKey desc = {kernel::kCPU, TypeId::kNumberTypeFloat32, schema::PrimitiveType_MulGrad};
+  auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc);
+  auto kernel_obj = creator(inputs, outputs, reinterpret_cast<OpParameter *>(param), NULL, desc);
+  kernel_obj->Run();
+
+  float *output_ptr = reinterpret_cast<float *>(outputs[0]->Data());
+  printf("==================output data=================\n");
+  for (int i = 0; i < 6; i++) {
+    std::cout << output_ptr[i] << " ,";
+  }
+  std::cout << std::endl;
+
+  std::string output_path = "./test_data/operators/arithmetic_fp32_9_dx1_5_4_6.bin";
+  EXPECT_EQ(0, lite::CompareRelativeOutput(reinterpret_cast<float *>(outputs[1]->Data()), output_path));
+
+  std::string dx2_path = "./test_data/operators/arithmetic_fp32_9_dx2_5_1_6.bin";
+  EXPECT_EQ(0, lite::CompareRelativeOutput(output_ptr, dx2_path));
+
+  for (int i = 0; i < 5; i++) delete all_tensors[i];
+  delete param;
+  MS_LOG(INFO) << "TestMulGrad4Fp32 passed";
+}
+
+TEST_F(TestArithmeticGradFp32, TestDivGradFp32) {
+  auto param = new ArithmeticParameter();
+  param->op_parameter_.type_ = PrimitiveType_DivGrad;
+  std::vector<lite::tensor::Tensor *> all_tensors =
+    GenerateTensorsForTest("./test_data/operators/arithmetic_fp32_5_dy_4_6.bin", 5);
+
+  std::vector<lite::tensor::Tensor *> inputs = {all_tensors[0], all_tensors[1], all_tensors[2]};
+  std::vector<lite::tensor::Tensor *> outputs = {all_tensors[3], all_tensors[4]};
+  kernel::KernelKey desc = {kernel::kCPU, TypeId::kNumberTypeFloat32, schema::PrimitiveType_DivGrad};
+  auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc);
+  auto kernel_obj = creator(inputs, outputs, reinterpret_cast<OpParameter *>(param), NULL, desc);
+  kernel_obj->Run();
+
+  float *output_ptr = reinterpret_cast<float *>(outputs[1]->Data());
+  printf("==================output data=================\n");
+  for (int i = 0; i < 6; i++) {
+    std::cout << output_ptr[i] << " ,";
+  }
+  std::cout << std::endl;
+
+  std::string output_path = "./test_data/operators/arithmetic_fp32_5_dx1_4_6.bin";
+  EXPECT_EQ(0, lite::CompareRelativeOutput(reinterpret_cast<float *>(outputs[0]->Data()), output_path));
+
+  std::string dx2_path = "./test_data/operators/arithmetic_fp32_5_dx2_1_6.bin";
+  EXPECT_EQ(0, lite::CompareRelativeOutput(output_ptr, dx2_path));
+
+  for (int i = 0; i < 5; i++) delete all_tensors[i];
+  delete param;
+  MS_LOG(INFO) << "TestDivGradFp32 passed";
+}
+
+TEST_F(TestArithmeticGradFp32, TestDivGrad2Fp32) {
+  auto param = new ArithmeticParameter();
+  param->op_parameter_.type_ = PrimitiveType_DivGrad;
+  std::vector<lite::tensor::Tensor *> all_tensors =
+    GenerateTensorsForTest("./test_data/operators/arithmetic_fp32_6_dy_4_6.bin", 6);
+
+  std::vector<lite::tensor::Tensor *> inputs = {all_tensors[0], all_tensors[2], all_tensors[1]};
+  std::vector<lite::tensor::Tensor *> outputs = {all_tensors[4], all_tensors[3]};
+  kernel::KernelKey desc = {kernel::kCPU, TypeId::kNumberTypeFloat32, schema::PrimitiveType_DivGrad};
+  auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc);
+  auto kernel_obj = creator(inputs, outputs, reinterpret_cast<OpParameter *>(param), NULL, desc);
+  kernel_obj->Run();
+
+  float *output_ptr = reinterpret_cast<float *>(outputs[0]->Data());
+  printf("==================output data=================\n");
+  for (int i = 0; i < 6; i++) {
+    std::cout << output_ptr[i] << " ,";
+  }
+  std::cout << std::endl;
+
+  std::string dx2_path = "./test_data/operators/arithmetic_fp32_6_dx2_4_6.bin";
+  EXPECT_EQ(0, lite::CompareRelativeOutput(reinterpret_cast<float *>(outputs[1]->Data()), dx2_path));
+
+  std::string output_path = "./test_data/operators/arithmetic_fp32_6_dx1_1_6.bin";
+  EXPECT_EQ(0, lite::CompareRelativeOutput(output_ptr, output_path));
+
+  for (int i = 0; i < 5; i++) delete all_tensors[i];
+  delete param;
+  MS_LOG(INFO) << "TestDivGrad2Fp32 passed";
+}
+
+TEST_F(TestArithmeticGradFp32, TestDivGrad3Fp32) {
+  auto param = new ArithmeticParameter();
+  param->op_parameter_.type_ = PrimitiveType_DivGrad;
+  std::vector<lite::tensor::Tensor *> all_tensors =
+    GenerateTensorsForTest("./test_data/operators/arithmetic_fp32_10_dy_5_4_6.bin", 10);
+
+  std::vector<lite::tensor::Tensor *> inputs = {all_tensors[0], all_tensors[1], all_tensors[2]};
+  std::vector<lite::tensor::Tensor *> outputs = {all_tensors[3], all_tensors[4]};
+  kernel::KernelKey desc = {kernel::kCPU, TypeId::kNumberTypeFloat32, schema::PrimitiveType_DivGrad};
+  auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc);
+  auto kernel_obj = creator(inputs, outputs, reinterpret_cast<OpParameter *>(param), NULL, desc);
+  kernel_obj->Run();
+
+  float *output_ptr = reinterpret_cast<float *>(outputs[1]->Data());
+  printf("==================output data=================\n");
+  for (int i = 0; i < 6; i++) {
+    std::cout << output_ptr[i] << " ,";
+  }
+  std::cout << std::endl;
+
+  std::string dx1_path = "./test_data/operators/arithmetic_fp32_10_dx1_5_4_6.bin";
+  EXPECT_EQ(0, lite::CompareRelativeOutput(reinterpret_cast<float *>(outputs[0]->Data()), dx1_path));
+
+  std::string output_path = "./test_data/operators/arithmetic_fp32_10_dx2_5_1_6.bin";
+  EXPECT_EQ(0, lite::CompareRelativeOutput(output_ptr, output_path));
+
+  for (int i = 0; i < 5; i++) delete all_tensors[i];
+  delete param;
+  MS_LOG(INFO) << "TestDivGrad3Fp32 passed";
+}
+
+TEST_F(TestArithmeticGradFp32, Test3DDivGrad2Fp32) {
+  auto param = new ArithmeticParameter();
+  param->op_parameter_.type_ = PrimitiveType_DivGrad;
+  std::vector<lite::tensor::Tensor *> all_tensors =
+    GenerateTensorsForTest("./test_data/operators/arithmetic_fp32_7_dy_4_5_6.bin", 7);
+
+  std::vector<lite::tensor::Tensor *> inputs = {all_tensors[0], all_tensors[1], all_tensors[2]};
+  std::vector<lite::tensor::Tensor *> outputs = {all_tensors[3], all_tensors[4]};
+  kernel::KernelKey desc = {kernel::kCPU, TypeId::kNumberTypeFloat32, schema::PrimitiveType_DivGrad};
+  auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc);
+  auto kernel_obj = creator(inputs, outputs, reinterpret_cast<OpParameter *>(param), NULL, desc);
+  kernel_obj->Run();
+
+  float *output_ptr = reinterpret_cast<float *>(outputs[1]->Data());
+  printf("==================output data=================\n");
+  for (int i = 0; i < 6; i++) {
+    std::cout << output_ptr[i] << " ,";
+  }
+  std::cout << std::endl;
+
+  std::string dx1_path = "./test_data/operators/arithmetic_fp32_7_dx1_4_5_6.bin";
+  EXPECT_EQ(0, lite::CompareRelativeOutput(reinterpret_cast<float *>(outputs[0]->Data()), dx1_path));
+
+  std::string output_path = "./test_data/operators/arithmetic_fp32_7_dx2_1_1_6.bin";
+  EXPECT_EQ(0, lite::CompareRelativeOutput(output_ptr, output_path));
+
+  for (int i = 0; i < 5; i++) delete all_tensors[i];
+  delete param;
+  MS_LOG(INFO) << "TestDivGrad2Fp32 passed";
+}
+
+}  // namespace mindspore
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/bias_grad_fp32_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/bias_grad_fp32_tests.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7c26e9502291b181e45f259acd55d4f7c696847e
--- /dev/null
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/bias_grad_fp32_tests.cc
@@ -0,0 +1,71 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <iostream>
+#include <memory>
+#include "utils/log_adapter.h"
+#include "common/common_test.h"
+#include "src/common/file_utils.h"
+#include "mindspore/lite/src/runtime/kernel/arm/fp32/bias_grad.h"
+#include "mindspore/lite/src/kernel_registry.h"
+
+namespace mindspore {
+
+class TestBiasGradFp32 : public mindspore::Common {
+ public:
+  TestBiasGradFp32() {}
+};
+
+TEST_F(TestBiasGradFp32, BiasGradFp32) {
+  // prepare stage
+  auto bias_param = new ArithmeticParameter();
+
+  size_t input_size;
+  std::string input_path = "./test_data/operators/biasgradfp32_1_dy_10_28_28_7.bin";
+  auto input_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(input_path.c_str(), &input_size));
+  std::vector<int> dim_dy({10, 28, 28, 7});
+  lite::tensor::Tensor dy_tensor(TypeId::kNumberTypeFloat32, dim_dy);
+  dy_tensor.SetData(input_data);
+
+  std::vector<lite::tensor::Tensor *> inputs = {&dy_tensor};
+
+  auto output_data = new float[7];
+  std::vector<int> dim_dw({7});
+  lite::tensor::Tensor dw_tensor(TypeId::kNumberTypeFloat32, dim_dw);
+  dw_tensor.SetData(output_data);
+  std::vector<lite::tensor::Tensor *> outputs = {&dw_tensor};
+
+  kernel::KernelKey desc = {kernel::kCPU, TypeId::kNumberTypeFloat32, schema::PrimitiveType_BiasGrad};
+
+  auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc);
+  auto kernel_obj = creator(inputs, outputs, reinterpret_cast<OpParameter *>(bias_param), NULL, desc);
+
+  kernel_obj->Run();
+
+  printf("==================output data=================\n");
+  for (int i = 0; i < 7; i++) {
+    std::cout << output_data[i] << " ,";
+  }
+  std::cout << std::endl;
+  std::string output_path = "./test_data/operators/biasgradfp32_1_db_7.bin";
+  lite::CompareOutput(output_data, output_path);
+
+  // delete input_data;
+  // delete[] output_data;
+  delete bias_param;
+  MS_LOG(INFO) << "BiasGradFp32 passed";
+}
+
+}  // namespace mindspore
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/convolution_grad_fp32_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/convolution_grad_fp32_tests.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ef501fa65537411f547db122df6ac8e99a18916c
--- /dev/null
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/convolution_grad_fp32_tests.cc
@@ -0,0 +1,521 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <iostream>
+#include <memory>
+#include <vector>
+#include "utils/log_adapter.h"
+#include "common/common_test.h"
+#include "src/common/file_utils.h"
+#include "src/common/file_utils_ext.h"
+#include "mindspore/lite/src/runtime/kernel/arm/fp32/convolution_grad_filter.h"
+#include "mindspore/lite/src/runtime/kernel/arm/fp32/convolution_grad_input.h"
+#include "mindspore/lite/src/runtime/kernel/arm/opclib/conv_parameter.h"
+#include "mindspore/lite/src/kernel_registry.h"
+
+namespace mindspore {
+class TestConvolutionGradFp32 :  public mindspore::Common {
+ public:
+  TestConvolutionGradFp32() {}
+};
+
+void InitConvParamGroup1FP32(ConvParameter *conv_param) {
+  conv_param->input_batch_ = 1;
+  conv_param->input_h_ = 28;
+  conv_param->input_w_ = 28;
+  conv_param->input_channel_ = 3;
+
+  conv_param->output_batch_ = 1;
+  conv_param->output_h_ = 28;
+  conv_param->output_w_ = 28;
+  conv_param->output_channel_ = 32;
+
+  conv_param->kernel_h_ = 3;
+  conv_param->kernel_w_ = 3;
+
+  conv_param->stride_h_ = 1;
+  conv_param->stride_w_ = 1;
+
+  conv_param->dilation_h_ = 1;
+  conv_param->dilation_w_ = 1;
+
+  conv_param->pad_h_ = 1;
+  conv_param->pad_w_ = 1;
+
+  conv_param->group_ = 1;
+  conv_param->is_relu_ = false;
+  conv_param->is_relu6_ = false;
+  conv_param->thread_num_ = 1;
+}
+
+void InitConvParamGroup3FP32(ConvParameter *conv_param) {
+  InitConvParamGroup1FP32(conv_param);
+  conv_param->group_ = 3;
+  conv_param->output_channel_ = 18;
+}
+
+void InitConvParamGroup3Dilation2FP32(ConvParameter *conv_param) {
+  InitConvParamGroup3FP32(conv_param);
+  conv_param->dilation_h_ = 2;
+  conv_param->dilation_w_ = 2;
+  conv_param->output_h_ = 26;
+  conv_param->output_w_ = 26;
+}
+
+TEST_F(TestConvolutionGradFp32, ConvFp32FilterGrad) {
+  // prepare stage
+  auto conv_param = new ConvParameter();
+  InitConvParamGroup1FP32(conv_param);
+
+  size_t dy_size;
+  std::string dy_path = "./test_data/conv/convfp32_dy_1_28_28_32.bin";
+  auto dy_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(dy_path.c_str(), &dy_size));
+  std::vector<int> dim_dy({1, 28, 28, 32});
+  lite::tensor::Tensor dy_tensor(TypeId::kNumberTypeFloat32, dim_dy);
+  dy_tensor.SetData(dy_data);
+
+  // runtime part
+  printf("Calculating runtime cost...\n");
+  uint64_t time_avg = 0;
+  size_t output_data_size =
+    conv_param->output_channel_ * conv_param->kernel_h_ * conv_param->kernel_w_ * conv_param->input_channel_;
+
+  size_t input_size;
+  std::string input_path = "./test_data/conv/convfp32_x_1_28_28_3.bin";
+  auto input_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(input_path.c_str(), &input_size));
+  std::vector<int> dim_x({1, 28, 28, 3});
+  lite::tensor::Tensor x_tensor(TypeId::kNumberTypeFloat32, dim_x);
+  x_tensor.SetData(input_data);
+
+  auto dw_data = new float[output_data_size];
+  std::vector<int> dim_dw({32, 3, 3, 3});
+  lite::tensor::Tensor dw_tensor(TypeId::kNumberTypeFloat32, dim_dw);
+  dw_tensor.SetData(dw_data);
+  std::vector<lite::tensor::Tensor *> inputs = {&dy_tensor, &x_tensor};
+  std::vector<lite::tensor::Tensor *> outputs = {&dw_tensor};
+
+  kernel::KernelKey desc = {kernel::kCPU, TypeId::kNumberTypeFloat32, schema::PrimitiveType_Conv2DGradFilter};
+  auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc);
+  auto kernel = creator(inputs, outputs, reinterpret_cast<OpParameter *>(conv_param), NULL, desc);
+
+  // warm up loop
+  for (int i = 0; i < 3; i++) {
+    kernel->Run();
+  }
+
+  int loop_count = 100;
+  auto time_start = mindspore::lite::GetTimeUs();
+  for (int i = 0; i < loop_count; i++) {
+    kernel->Run();
+  }
+  auto time_end = mindspore::lite::GetTimeUs();
+  auto cost = time_end - time_start;
+  time_avg = cost / loop_count;
+  printf("single thread running time : %f ms\n", time_avg / 1000.0f);
+
+  std::string output_path = "./test_data/conv/convfp32_dw_32_3_3_3.bin";
+  auto res = lite::CompareRelativeOutput(dw_data, output_path);
+
+  EXPECT_EQ(res, 0);
+
+  // delete input_data;
+  // delete dy_data;
+  // delete [] dw_data;
+  delete kernel;
+  delete conv_param;
+  MS_LOG(INFO) << "TestConvolutionGradFp32 Filter Grad passed";
+}
+
+TEST_F(TestConvolutionGradFp32, ConvFp32InputGrad) {
+  // prepare stage
+  auto conv_param = new ConvParameter();
+  InitConvParamGroup1FP32(conv_param);
+
+  size_t dy_size;
+  std::string dy_path = "./test_data/conv/convfp32_dy_1_28_28_32.bin";
+  auto dy_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(dy_path.c_str(), &dy_size));
+  std::vector<int> dim_dy({1, 28, 28, 32});
+  lite::tensor::Tensor dy_tensor(TypeId::kNumberTypeFloat32, dim_dy);
+  dy_tensor.SetData(dy_data);
+
+  size_t w_size;
+  std::string w_path = "./test_data/conv/convfp32_w_32_3_3_3.bin";
+  auto w_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(w_path.c_str(), &w_size));
+  std::vector<int> dim_dw({32, 3, 3, 3});
+  lite::tensor::Tensor w_tensor(TypeId::kNumberTypeFloat32, dim_dw);
+  w_tensor.SetData(w_data);
+
+  size_t output_data_size =
+    conv_param->input_batch_ * conv_param->input_h_ * conv_param->input_w_ * conv_param->input_channel_;
+  auto dx_data = new float[output_data_size];
+  std::vector<int> dim_dx({1, 28, 28, 3});
+  lite::tensor::Tensor dx_tensor(TypeId::kNumberTypeFloat32, dim_dx);
+  dx_tensor.SetData(dx_data);
+
+  std::vector<lite::tensor::Tensor *> inputs = {&dy_tensor, &w_tensor};
+  std::vector<lite::tensor::Tensor *> outputs = {&dx_tensor};
+  // runtime part
+
+  printf("Calculating runtime cost...\n");
+  uint64_t time_avg = 0;
+
+  kernel::KernelKey desc = {kernel::kCPU, TypeId::kNumberTypeFloat32, schema::PrimitiveType_Conv2DGradInput};
+  auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc);
+  auto kernel = creator(inputs, outputs, reinterpret_cast<OpParameter *>(conv_param), NULL, desc);
+
+  // warm up loop
+  for (int i = 0; i < 3; i++) {
+    kernel->Run();
+  }
+
+  int loop_count = 100;
+  auto time_start = mindspore::lite::GetTimeUs();
+  for (int i = 0; i < loop_count; i++) {
+    kernel->Run();
+  }
+  auto time_end = mindspore::lite::GetTimeUs();
+  auto cost = time_end - time_start;
+  time_avg = cost / loop_count;
+  printf("single thread running time : %f ms\n", time_avg / 1000.0f);
+
+  std::string output_path = "./test_data/conv/convfp32_dx_1_28_28_3.bin";
+  auto res = lite::CompareRelativeOutput(dx_data, output_path);
+  EXPECT_EQ(res, 0);
+
+  delete kernel;
+  delete conv_param;
+  MS_LOG(INFO) << "TestConvolutionGradFp32 Filter Grad passed";
+}
+
+TEST_F(TestConvolutionGradFp32, ConvFp32GroupFilterGrad) {
+  // prepare stage
+  auto conv_param = new ConvParameter();
+  InitConvParamGroup3FP32(conv_param);
+
+  size_t dy_size;
+  std::string dy_path = "./test_data/conv/convfp32_dy_g3_1_28_28_18.bin";
+  auto dy_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(dy_path.c_str(), &dy_size));
+  std::vector<int> dim_dy({1, 28, 28, 18});
+  lite::tensor::Tensor dy_tensor(TypeId::kNumberTypeFloat32, dim_dy);
+  dy_tensor.SetData(dy_data);
+
+  // runtime part
+  printf("Calculating runtime cost...\n");
+  uint64_t time_avg = 0;
+  size_t output_data_size = conv_param->output_channel_ * conv_param->kernel_h_ * conv_param->kernel_w_ *
+                            conv_param->input_channel_ / conv_param->group_;
+
+  size_t input_size;
+  std::string input_path = "./test_data/conv/convfp32_x_g3_1_28_28_3.bin";
+  auto input_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(input_path.c_str(), &input_size));
+  std::vector<int> dim_x({1, 28, 28, 3});
+  lite::tensor::Tensor x_tensor(TypeId::kNumberTypeFloat32, dim_x);
+  x_tensor.SetData(input_data);
+
+  auto dw_data = new float[output_data_size];
+  std::vector<int> dim_dw({18, 3, 3, 1});
+  lite::tensor::Tensor dw_tensor(TypeId::kNumberTypeFloat32, dim_dw);
+  dw_tensor.SetData(dw_data);
+  std::vector<lite::tensor::Tensor *> inputs = {&dy_tensor, &x_tensor};
+  std::vector<lite::tensor::Tensor *> outputs = {&dw_tensor};
+
+  kernel::KernelKey desc = {kernel::kCPU, TypeId::kNumberTypeFloat32, schema::PrimitiveType_Conv2DGradFilter};
+  auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc);
+  auto kernel = creator(inputs, outputs, reinterpret_cast<OpParameter *>(conv_param), NULL, desc);
+
+  // warm up loop
+  for (int i = 0; i < 3; i++) {
+    kernel->Run();
+  }
+
+  int loop_count = 100;
+  auto time_start = mindspore::lite::GetTimeUs();
+  for (int i = 0; i < loop_count; i++) {
+    kernel->Run();
+  }
+  auto time_end = mindspore::lite::GetTimeUs();
+  auto cost = time_end - time_start;
+  time_avg = cost / loop_count;
+  printf("single thread running time : %f ms\n", time_avg / 1000.0f);
+
+  std::string output_path = "./test_data/conv/convfp32_dw_g3_18_3_3_3.bin";
+  auto res = lite::CompareRelativeOutput(dw_data, output_path);
+  EXPECT_EQ(res, 0);
+
+  // delete input_data;
+  // delete dy_data;
+  // delete [] dw_data;
+  delete kernel;
+  delete conv_param;
+  MS_LOG(INFO) << "TestConvolutionGradFp32 Filter Grad passed";
+}
+
+TEST_F(TestConvolutionGradFp32, ConvFp32GroupInputGrad) {
+  // prepare stage
+  auto conv_param = new ConvParameter();
+  InitConvParamGroup3FP32(conv_param);
+
+  size_t dy_size;
+  std::string dy_path = "./test_data/conv/convfp32_dy_g3_1_28_28_18.bin";
+  auto dy_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(dy_path.c_str(), &dy_size));
+  std::vector<int> dim_dy({1, 28, 28, 18});
+  lite::tensor::Tensor dy_tensor(TypeId::kNumberTypeFloat32, dim_dy);
+  dy_tensor.SetData(dy_data);
+
+  size_t w_size;
+  std::string w_path = "./test_data/conv/convfp32_w_g3_18_3_3_3.bin";
+  auto w_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(w_path.c_str(), &w_size));
+  std::vector<int> dim_dw({18, 3, 3, 1});
+  lite::tensor::Tensor w_tensor(TypeId::kNumberTypeFloat32, dim_dw);
+  w_tensor.SetData(w_data);
+
+  size_t output_data_size =
+    conv_param->input_batch_ * conv_param->input_h_ * conv_param->input_w_ * conv_param->input_channel_;
+  auto dx_data = new float[output_data_size];
+  std::vector<int> dim_dx({1, 28, 28, 3});
+  lite::tensor::Tensor dx_tensor(TypeId::kNumberTypeFloat32, dim_dx);
+  dx_tensor.SetData(dx_data);
+
+  std::vector<lite::tensor::Tensor *> inputs = {&dy_tensor, &w_tensor};
+  std::vector<lite::tensor::Tensor *> outputs = {&dx_tensor};
+  // runtime part
+
+  printf("Calculating runtime cost...\n");
+  uint64_t time_avg = 0;
+
+  kernel::KernelKey desc = {kernel::kCPU, TypeId::kNumberTypeFloat32, schema::PrimitiveType_Conv2DGradInput};
+  auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc);
+  auto kernel = creator(inputs, outputs, reinterpret_cast<OpParameter *>(conv_param), NULL, desc);
+
+  // warm up loop
+  for (int i = 0; i < 3; i++) {
+    kernel->Run();
+  }
+
+  int loop_count = 100;
+  auto time_start = mindspore::lite::GetTimeUs();
+  for (int i = 0; i < loop_count; i++) {
+    kernel->Run();
+  }
+  auto time_end = mindspore::lite::GetTimeUs();
+  auto cost = time_end - time_start;
+  time_avg = cost / loop_count;
+  printf("single thread running time : %f ms\n", time_avg / 1000.0f);
+
+  std::string output_path = "./test_data/conv/convfp32_dx_g3_1_28_28_3.bin";
+  auto res = lite::CompareRelativeOutput(dx_data, output_path);
+  EXPECT_EQ(res, 0);
+
+  delete kernel;
+  delete conv_param;
+  MS_LOG(INFO) << "TestConvolutionGradFp32 Filter Grad passed";
+}
+
+TEST_F(TestConvolutionGradFp32, ConvFp32GroupDilationFilterGrad) {
+  // prepare stage
+  auto conv_param = new ConvParameter();
+
+  InitConvParamGroup3Dilation2FP32(conv_param);
+
+  size_t dy_size;
+  std::string dy_path = "./test_data/conv/convfp32_dy_g3_d2_1_26_26_18.bin";
+  auto dy_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(dy_path.c_str(), &dy_size));
+  std::vector<int> dim_dy({1, 26, 26, 18});
+  lite::tensor::Tensor dy_tensor(TypeId::kNumberTypeFloat32, dim_dy);
+  dy_tensor.SetData(dy_data);
+
+  // runtime part
+  printf("Calculating runtime cost...\n");
+  uint64_t time_avg = 0;
+  size_t output_data_size = conv_param->output_channel_ * conv_param->kernel_h_ * conv_param->kernel_w_ *
+                            conv_param->input_channel_ / conv_param->group_;
+
+  size_t input_size;
+  std::string input_path = "./test_data/conv/convfp32_x_g3_d2_1_28_28_3.bin";
+  auto input_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(input_path.c_str(), &input_size));
+  std::vector<int> dim_x({1, 28, 28, 3});
+  lite::tensor::Tensor x_tensor(TypeId::kNumberTypeFloat32, dim_x);
+  x_tensor.SetData(input_data);
+
+  auto dw_data = new float[output_data_size];
+  std::vector<int> dim_dw({18, 3, 3, 1});
+  lite::tensor::Tensor dw_tensor(TypeId::kNumberTypeFloat32, dim_dw);
+  dw_tensor.SetData(dw_data);
+  std::vector<lite::tensor::Tensor *> inputs = {&dy_tensor, &x_tensor};
+  std::vector<lite::tensor::Tensor *> outputs = {&dw_tensor};
+
+  kernel::KernelKey desc = {kernel::kCPU, TypeId::kNumberTypeFloat32, schema::PrimitiveType_Conv2DGradFilter};
+  auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc);
+  auto kernel = creator(inputs, outputs, reinterpret_cast<OpParameter *>(conv_param), NULL, desc);
+
+  // warm up loop
+  for (int i = 0; i < 3; i++) {
+    kernel->Run();
+  }
+
+  int loop_count = 100;
+  auto time_start = mindspore::lite::GetTimeUs();
+  for (int i = 0; i < loop_count; i++) {
+    kernel->Run();
+  }
+  auto time_end = mindspore::lite::GetTimeUs();
+  auto cost = time_end - time_start;
+  time_avg = cost / loop_count;
+  printf("single thread running time : %f ms\n", time_avg / 1000.0f);
+
+  std::string output_path = "./test_data/conv/convfp32_dw_g3_d2_18_3_3_3.bin";
+  auto res = lite::CompareRelativeOutput(dw_data, output_path);
+  EXPECT_EQ(res, 0);
+  // delete input_data;
+  // delete dy_data;
+  // delete [] dw_data;
+  delete kernel;
+  delete conv_param;
+  MS_LOG(INFO) << "TestConvolutionGradFp32 Filter Grad passed";
+}
+
+TEST_F(TestConvolutionGradFp32, ConvFp32GroupDilationInputGrad) {
+  // prepare stage
+  auto conv_param = new ConvParameter();
+  InitConvParamGroup3Dilation2FP32(conv_param);
+
+  size_t dy_size;
+  std::string dy_path = "./test_data/conv/convfp32_dy_g3_d2_1_26_26_18.bin";
+  auto dy_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(dy_path.c_str(), &dy_size));
+  std::vector<int> dim_dy({1, 26, 26, 18});
+  lite::tensor::Tensor dy_tensor(TypeId::kNumberTypeFloat32, dim_dy);
+  dy_tensor.SetData(dy_data);
+
+  size_t w_size;
+  std::string w_path = "./test_data/conv/convfp32_w_g3_d2_18_3_3_3.bin";
+  auto w_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(w_path.c_str(), &w_size));
+  std::vector<int> dim_w({18, 3, 3, 1});
+  lite::tensor::Tensor w_tensor(TypeId::kNumberTypeFloat32, dim_w);
+  w_tensor.SetData(w_data);
+
+  size_t output_data_size =
+    conv_param->input_batch_ * conv_param->input_h_ * conv_param->input_w_ * conv_param->input_channel_;
+  auto dx_data = new float[output_data_size];
+  std::vector<int> dim_dx({1, 28, 28, 3});
+  lite::tensor::Tensor dx_tensor(TypeId::kNumberTypeFloat32, dim_dx);
+  dx_tensor.SetData(dx_data);
+
+  std::vector<lite::tensor::Tensor *> inputs = {&dy_tensor, &w_tensor};
+  std::vector<lite::tensor::Tensor *> outputs = {&dx_tensor};
+  // runtime part
+
+  printf("Calculating runtime cost...\n");
+  uint64_t time_avg = 0;
+
+  kernel::KernelKey desc = {kernel::kCPU, TypeId::kNumberTypeFloat32, schema::PrimitiveType_Conv2DGradInput};
+  auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc);
+  auto kernel = creator(inputs, outputs, reinterpret_cast<OpParameter *>(conv_param), NULL, desc);
+
+  // warm up loop
+  for (int i = 0; i < 3; i++) {
+    kernel->Run();
+  }
+
+  int loop_count = 100;
+  auto time_start = mindspore::lite::GetTimeUs();
+  for (int i = 0; i < loop_count; i++) {
+    kernel->Run();
+  }
+  auto time_end = mindspore::lite::GetTimeUs();
+  auto cost = time_end - time_start;
+  time_avg = cost / loop_count;
+  printf("single thread running time : %f ms\n", time_avg / 1000.0f);
+
+  std::string output_path = "./test_data/conv/convfp32_dx_g3_d2_1_28_28_3.bin";
+  auto res = lite::CompareRelativeOutput(dx_data, output_path);
+  EXPECT_EQ(res, 0);
+
+  delete kernel;
+  delete conv_param;
+  MS_LOG(INFO) << "TestConvolutionGradFp32 Filter Grad passed";
+}
+
+// TEST_F(TestConvolutionGradFp32, ConvGroupDilation) {
+//   // prepare stage
+//   auto conv_param = new ConvParameter();
+//   InitConvParamGroup3Dilation2FP32(conv_param);
+
+//   size_t x_size;
+//   std::string x_path = "./test_data/conv/convfp32_x_g3_d2_1_28_28_3.bin";
+//   auto x_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(x_path.c_str(), &x_size));
+//   std::vector<int> dim_x({1, 28, 28, 3});
+//   tensor::Tensor x_tensor(TypeId::kNumberTypeFloat32, dim_x);
+//   x_tensor.SetData(x_data);
+
+//   size_t w_size;
+//   std::string w_path = "./test_data/conv/convfp32_w_g3_d2_18_3_3_3.bin";
+//   auto w_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(w_path.c_str(), &w_size));
+//   std::vector<int> dim_w({18, 3, 3, 1});
+//   tensor::Tensor w_tensor(TypeId::kNumberTypeFloat32, dim_w);
+//   w_tensor.SetData(w_data);
+
+//   size_t output_data_size =
+//     conv_param->output_batch_ * conv_param->output_h_ * conv_param->output_w_ * conv_param->output_channel_;
+//   auto y_data = new float[output_data_size];
+//   std::vector<int> dim_y({1, 26, 26, 18});
+//   tensor::Tensor y_tensor(TypeId::kNumberTypeFloat32, dim_y);
+//   y_tensor.SetData(y_data);
+
+//   std::vector<tensor::Tensor *> inputs = {&x_tensor, &w_tensor};
+//   std::vector<tensor::Tensor *> outputs = {&y_tensor};
+//   // runtime part
+
+//   printf("Calculating runtime cost...\n");
+//   uint64_t time_avg = 0;
+
+//   lite::Context context;
+//   ;
+//   context.deviceCtx.type = lite::DT_CPU;
+//   context.threadNum = 1;
+
+//   kernel::KernelKey desc = {kernel::kCPU, kNumberTypeFloat32, schema::PrimitiveType_Conv2D};
+//   auto creator = lite::KernelRegistry::GetInstance()->GetKernelCreator(desc);
+//   auto kernel = creator(inputs, outputs, (OpParameter *)conv_param, &context, desc);
+
+//   kernel->train();
+//   EXPECT_EQ(kernel->is_train(), 1);
+
+//   // warm up loop
+//   for (int i = 0; i < 3; i++) {
+//     kernel->Run();
+//   }
+
+//   int loop_count = 100;
+//   auto time_start = mindspore::lite::GetTimeUs();
+//   for (int i = 0; i < loop_count; i++) {
+//     kernel->Run();
+//   }
+//   auto time_end = mindspore::lite::GetTimeUs();
+//   auto cost = time_end - time_start;
+//   time_avg = cost / loop_count;
+//   printf("single thread running time : %f ms\n", time_avg / 1000.0f);
+
+//   std::string output_path = "./test_data/conv/convfp32_y_g3_d2_1_26_26_18.bin";
+//   auto res = lite::CompareRelativeOutput(y_data, output_path);
+//   EXPECT_EQ(res, 0);
+
+//   delete kernel;
+//   delete conv_param;
+
+//   MS_LOG(INFO) << "TestConvolutionFp32 Filter Grad passed";
+// }
+
+}  // namespace mindspore
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/pooling_grad_fp32_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/pooling_grad_fp32_tests.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0dace8d7cdd32f17fb875126af6f0600cb4bfba1
--- /dev/null
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/pooling_grad_fp32_tests.cc
@@ -0,0 +1,332 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <iostream>
+#include <memory>
+#include "mindspore/lite/include/context.h"
+#include "utils/log_adapter.h"
+#include "common/common_test.h"
+#include "mindspore/lite/src/kernel_registry.h"
+#include "src/common/utils.h"
+#include "src/common/file_utils.h"
+#include "src/runtime/kernel/arm/fp32/pooling_grad.h"
+#include "src/runtime/kernel/arm/opclib/fp32/pooling_grad.h"
+
+namespace mindspore {
+class TestPoolingGradFp32 :  public mindspore::Common {
+ public:
+  TestPoolingGradFp32() {}
+};
+
+void InitPoolingParamFP32(PoolingParameter *pooling_param) {
+  pooling_param->input_batch_ = 1;
+  pooling_param->input_h_ = 28;
+  pooling_param->input_w_ = 28;
+  pooling_param->input_channel_ = 3;
+
+  pooling_param->output_batch_ = 1;
+  pooling_param->output_h_ = 28;
+  pooling_param->output_w_ = 28;
+  pooling_param->output_channel_ = 32;
+
+  pooling_param->window_h_ = 3;
+  pooling_param->window_w_ = 3;
+
+  pooling_param->stride_h_ = 1;
+  pooling_param->stride_w_ = 1;
+
+  pooling_param->pad_u_ = 1;
+  pooling_param->pad_d_ = 1;
+  pooling_param->pad_l_ = 1;
+  pooling_param->pad_r_ = 1;
+  pooling_param->thread_num_ = 1;
+}
+
+TEST_F(TestPoolingGradFp32, AvgPoolingGradFp32) {
+  // prepare stage
+  auto pooling_param = new PoolingParameter();
+  InitPoolingParamFP32(pooling_param);
+  pooling_param->output_channel_ = 3;
+
+  // runtime part
+  printf("Calculating runtime cost...\n");
+  uint64_t time_avg = 0;
+  size_t output_data_size =
+    pooling_param->output_batch_ * pooling_param->output_channel_ * pooling_param->output_h_ * pooling_param->output_w_;
+
+  size_t input_size;
+  std::string input_path = "./test_data/pooling/avgpoolgradfp32_1_dy_1_28_28_3.bin";
+  auto input_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(input_path.c_str(), &input_size));
+
+  auto output_data = new float[output_data_size];
+  // warm up loop
+  for (int i = 0; i < 3; i++) {
+    AvgPoolingGrad(input_data, output_data, pooling_param);
+  }
+
+  int loop_count = 100;
+  auto time_start = mindspore::lite::GetTimeUs();
+  for (int i = 0; i < loop_count; i++) {
+    AvgPoolingGrad(input_data, output_data, pooling_param);
+  }
+  auto time_end = mindspore::lite::GetTimeUs();
+  auto cost = time_end - time_start;
+  time_avg = cost / loop_count;
+  printf("single thread running time : %f ms\n", time_avg / 1000.0f);
+
+  printf("==================output data=================\n");
+  for (int i = 0; i < 20; i++) {
+    std::cout << output_data[i] << " ,";
+  }
+  std::cout << std::endl;
+  std::string output_path = "./test_data/pooling/avgpoolgradfp32_1_dx_1_28_28_3.bin";
+  lite::CompareOutput(output_data, output_path);
+
+  delete input_data;
+  delete[] output_data;
+  delete pooling_param;
+  MS_LOG(INFO) << "TestAvgPoolingGradFp32 passed";
+}
+
+TEST_F(TestPoolingGradFp32, AvgPoolingKernelGradFp32) {
+  // prepare stage
+  auto pooling_param = new PoolingParameter();
+  InitPoolingParamFP32(pooling_param);
+
+  pooling_param->output_channel_ = 3;
+
+  // runtime part
+  printf("Calculating runtime cost...\n");
+  // uint64_t time_avg = 0;
+  size_t output_data_size =
+    pooling_param->output_batch_ * pooling_param->output_channel_ * pooling_param->output_h_ * pooling_param->output_w_;
+
+  size_t input_size;
+  std::string input_path = "./test_data/pooling/avgpoolgradfp32_1_dy_1_28_28_3.bin";
+  auto input_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(input_path.c_str(), &input_size));
+  std::vector<int> dim_dy({1, 28, 28, 3});
+  lite::tensor::Tensor dy_tensor(TypeId::kNumberTypeFloat32, dim_dy);
+  dy_tensor.SetData(input_data);
+
+  std::string input1_path = "./test_data/pooling/avgpoolgradfp32_1_x_1_28_28_3.bin";
+  input_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(input1_path.c_str(), &input_size));
+  std::vector<int> dim_x({1, 28, 28, 3});
+  lite::tensor::Tensor x_tensor(TypeId::kNumberTypeFloat32, dim_x);
+  x_tensor.SetData(input_data);
+
+  std::vector<lite::tensor::Tensor *> inputs = {&dy_tensor, &x_tensor};
+
+  auto output_data = new float[output_data_size];
+  std::vector<int> dim_dx({1, 28, 28, 3});
+  lite::tensor::Tensor dx_tensor(TypeId::kNumberTypeFloat32, dim_dx);
+  dx_tensor.SetData(output_data);
+  std::vector<lite::tensor::Tensor *> outputs = {&dx_tensor};
+
+  kernel::KernelKey desc = {kernel::kCPU, TypeId::kNumberTypeFloat32, schema::PrimitiveType_PoolingGrad};
+
+  auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc);
+  auto kernel_obj = creator(inputs, outputs, reinterpret_cast<OpParameter *>(pooling_param), NULL, desc);
+
+  kernel_obj->Run();
+
+  printf("==================output data=================\n");
+  for (int i = 0; i < 20; i++) {
+    std::cout << output_data[i] << " ,";
+  }
+  std::cout << std::endl;
+  std::string output_path = "./test_data/pooling/avgpoolgradfp32_1_dx_1_28_28_3.bin";
+  lite::CompareOutput(output_data, output_path);
+
+  // delete input_data;
+  // delete[] output_data;
+  delete pooling_param;
+  MS_LOG(INFO) << "TestAvgPoolingGradFp32 passed";
+}
+
+TEST_F(TestPoolingGradFp32, MaxPoolingGradFp32) {
+  // prepare stage
+  auto pooling_param = new PoolingParameter();
+  InitPoolingParamFP32(pooling_param);
+  pooling_param->output_channel_ = 3;
+  pooling_param->avg_pooling_ = false;
+  pooling_param->max_pooling_ = true;
+  // runtime part
+  printf("Calculating runtime cost...\n");
+  uint64_t time_avg = 0;
+  size_t output_data_size =
+    pooling_param->output_batch_ * pooling_param->output_channel_ * pooling_param->output_h_ * pooling_param->output_w_;
+
+  size_t input_size;
+  std::string i_path = "./test_data/pooling/maxpoolgradfp32_1_i_1_28_28_3.bin";
+  auto ill_data = reinterpret_cast<int64_t *>(mindspore::lite::ReadFile(i_path.c_str(), &input_size));
+  auto i_data = new int[output_data_size];
+  for (uint32_t i = 0; i < output_data_size; i++) {
+    i_data[i] = static_cast<int>(ill_data[i]);
+  }
+
+  std::string dy_path = "./test_data/pooling/maxpoolgradfp32_1_dy_1_28_28_3.bin";
+  auto dy_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(dy_path.c_str(), &input_size));
+
+  auto output_data = new float[output_data_size];
+  // warm up loop
+  for (int i = 0; i < 3; i++) {
+    MaxPoolingGrad(dy_data, i_data, output_data, pooling_param);
+  }
+
+  int loop_count = 100;
+  auto time_start = mindspore::lite::GetTimeUs();
+  for (int i = 0; i < loop_count; i++) {
+    MaxPoolingGrad(dy_data, i_data, output_data, pooling_param);
+  }
+  auto time_end = mindspore::lite::GetTimeUs();
+  auto cost = time_end - time_start;
+  time_avg = cost / loop_count;
+  printf("single thread running time : %f ms\n", time_avg / 1000.0f);
+
+  printf("==================output data=================\n");
+  for (int i = 0; i < 20; i++) {
+    std::cout << output_data[i] << " ,";
+  }
+  std::cout << std::endl;
+  std::string output_path = "./test_data/pooling/maxpoolgradfp32_1_dx_1_28_28_3.bin";
+  lite::CompareOutput(output_data, output_path);
+
+  // delete input_data;
+  delete pooling_param;
+  delete[] output_data;
+  MS_LOG(INFO) << "TestMaxPoolingGradFp32 passed";
+}
+
+#if 0
+TEST_F(TestPoolingGradFp32, MaxPoolingKernelGradFp32) {
+  // prepare stage
+  auto maxpool = new PoolingParameter();
+  InitPoolingParamFP32(maxpool);
+  maxpool->avg_pooling_ = false;
+  maxpool->max_pooling_ = true;
+  maxpool->input_h_ = 30;
+  maxpool->input_w_ = 30;
+  maxpool->input_channel_ = 3;
+
+  maxpool->output_batch_ = 1;
+  maxpool->output_h_ = 10;
+  maxpool->output_w_ = 10;
+  maxpool->output_channel_ = 3;
+  maxpool->stride_h_ = 3;
+  maxpool->stride_w_ = 3;
+
+  maxpool->pad_u_ = 0;
+  maxpool->pad_d_ = 0;
+  maxpool->pad_l_ = 0;
+  maxpool->pad_r_ = 0;
+
+  size_t input_size;
+  size_t y_data_size = maxpool->output_batch_ * maxpool->output_channel_ * maxpool->output_h_ * maxpool->output_w_;
+
+  auto x_data = reinterpret_cast<float *>(
+    mindspore::lite::ReadFile("./test_data/pooling/maxpoolgradfp32_2_x_1_30_30_3.bin", &input_size));
+  std::vector<int> dim_x({1, 30, 30, 3});
+  lite::tensor::Tensor x_tensor(TypeId::kNumberTypeFloat32, dim_x);
+  x_tensor.SetData(x_data);
+  std::vector<lite::tensor::Tensor *> maxpool_inputs = {&x_tensor};
+
+  auto y_data = new float[y_data_size];
+  std::vector<int> dim_y({1, 10, 10, 3});
+  lite::tensor::Tensor y_tensor(TypeId::kNumberTypeFloat32, dim_y);
+  y_tensor.SetData(y_data);
+
+  auto ind_data = new int[y_data_size];
+  lite::tensor::Tensor ind_tensor(TypeId::kNumberTypeInt32, dim_y);
+  ind_tensor.SetData(ind_data);
+
+  std::vector<lite::tensor::Tensor *> maxpool_outputs = {&y_tensor, &ind_tensor};
+
+  kernel::KernelKey maxpool_desc = {kernel::kCPU, TypeId::kNumberTypeFloat32, schema::PrimitiveType_Pooling};
+  auto maxpool_creator = lite::KernelRegistry::GetInstance()->GetCreator(maxpool_desc);
+  auto maxpoolobj = maxpool_creator(maxpool_inputs, maxpool_outputs, reinterpret_cast<OpParameter *>(maxpool),
+                                    NULL, maxpool_desc);
+  maxpoolobj->Run();
+
+  printf("==================indices data=================\n");
+  for (int i = 0; i < 10; i++) {
+    std::cout << ind_data[i] << " ,";
+  }
+  std::cout << std::endl;
+
+  auto pooling_param = new PoolingParameter();
+  InitPoolingParamFP32(pooling_param);
+  pooling_param->avg_pooling_ = false;
+  pooling_param->max_pooling_ = true;
+  pooling_param->input_h_ = 10;
+  pooling_param->input_w_ = 10;
+  pooling_param->input_channel_ = 3;
+
+  pooling_param->output_batch_ = 1;
+  pooling_param->output_h_ = 30;
+  pooling_param->output_w_ = 30;
+  pooling_param->output_channel_ = 3;
+
+  // runtime part
+  printf("Calculating runtime cost...\n");
+  // uint64_t time_avg = 0;
+  size_t output_data_size =
+    pooling_param->output_batch_ * pooling_param->output_channel_ * pooling_param->output_h_ * pooling_param->output_w_;
+
+  auto dy_data = reinterpret_cast<float *>(
+    mindspore::lite::ReadFile("./test_data/pooling/maxpoolgradfp32_2_dy_1_10_10_3.bin", &input_size));
+  std::vector<int> dim_dy({1, 3, 10, 10});
+  lite::tensor::Tensor dy_tensor(TypeId::kNumberTypeFloat32, dim_dy);
+  dy_tensor.SetData(dy_data);
+
+#if 0
+  std::string i_path = "./test_data/pooling/maxpoolgradfp32_2_i_1_3_10_10.bin";
+  auto ill_data = reinterpret_cast<int64_t*>(mindspore::lite::ReadFile(i_path.c_str(), &input_size));
+  auto i_data = new int[output_data_size];
+  for (int i=0; i < output_data_size; i++)
+    i_data[i] = static_cast<int>(ill_data[i]);
+  std::vector<int> dim_ind({1, 3, 10, 10});
+  lite::tensor::Tensor ind_tensor(TypeId::kNumberTypeInt32, dim_ind);
+  ind_tensor.SetData(i_data);
+#endif
+
+  std::vector<lite::tensor::Tensor *> inputs = {&dy_tensor, &ind_tensor};
+
+  auto output_data = new float[output_data_size];
+  std::vector<int> dim_dx({1, 3, 30, 30});
+  lite::tensor::Tensor dx_tensor(TypeId::kNumberTypeFloat32, dim_dx);
+  dx_tensor.SetData(output_data);
+  std::vector<lite::tensor::Tensor *> outputs = {&dx_tensor};
+
+  kernel::KernelKey desc = {kernel::kCPU, TypeId::kNumberTypeFloat32, schema::PrimitiveType_PoolingGrad};
+  auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc);
+  auto kernel_obj = creator(inputs, outputs, reinterpret_cast<OpParameter *>(pooling_param), NULL, desc);
+  kernel_obj->Run();
+
+  printf("==================output data=================\n");
+  for (int i = 0; i < 20; i++) {
+    std::cout << output_data[i] << " ,";
+  }
+  std::cout << std::endl;
+  std::string output_path = "./test_data/pooling/maxpoolgradfp32_2_dx_1_30_30_3.bin";
+  lite::CompareOutput(output_data, output_path);
+
+  // delete input_data;
+  // delete[] output_data;
+  delete pooling_param;
+  MS_LOG(INFO) << "TestMaxPoolingKernelGradFp32 passed";
+}
+#endif  // if 0 before MaxPoolingKernelGradFp32
+}  // namespace mindspore
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/hsigmoid_out_50.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/hsigmoid_out_50.bin
new file mode 100644
index 0000000000000000000000000000000000000000..9418b5866b472c97233ce255d056b3f84527df44
--- /dev/null
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/hsigmoid_out_50.bin
@@ -0,0 +1 @@
+�"x>#�>K9�>pR>)J	>��4>�K>��Z>��>��>��L>��=�Q>�*^>M�>&�>6>S�>�*�>�N>�-�=�+L>�vK>+A}>w�^>$�Q>��s>�/W>���=M�'>9[*>#%�<#�>C�>>��>$��=G�j>��>�7*>��2>��6>��>�1p>�s#>Y)>��k>9��=���=lQ0>��w>
\ No newline at end of file
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/hsigmoid_x_50.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/hsigmoid_x_50.bin
new file mode 100644
index 0000000000000000000000000000000000000000..d216d74d9dbe53d43de8f103f81fd8416ff55ceb
--- /dev/null
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/hsigmoid_x_50.bin
@@ -0,0 +1 @@
+�M�?Ƿ����?	�H2�|�7>0�?dyX?C�.�\fT��@?�ͳ������g?Lw񾫘žE���9&�7A�?�T?�XF�4�?�?�ҹ?(k?�0?��?�VH?-��Tz@�&����"-��1��w���?�F����?��羢D���>��Y>���_p�?����]�	�?�%R�5��Ks�=��?
\ No newline at end of file
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/hsigmoid_yt_50.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/hsigmoid_yt_50.bin
new file mode 100644
index 0000000000000000000000000000000000000000..23f04a201536070ad6fd6e5d0d2dcefeba6b581c
--- /dev/null
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/hsigmoid_yt_50.bin
@@ -0,0 +1 @@
+��?�6V?�U�?��S?=�M?;��?3P�?;�?��E?�Ln?u��?�!?��V?��?s�W?9_?�e?}�H?h��?��?X�=?� �?%��?��?Y1�?[s�?�?�c�?��?t�{?Ո?�7�=�DK?e�W?���?��?�>�?kcY?�S?��?�?_fQ?u%�?�-u?�}?���?k�9?��?=�?���?
\ No newline at end of file
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/hswish_out_50.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/hswish_out_50.bin
new file mode 100644
index 0000000000000000000000000000000000000000..6ff3dd84c97a76ff7d3b7e59c30d5c9688edd2a7
--- /dev/null
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/hswish_out_50.bin
@@ -0,0 +1 @@
+v��=q�ٽ�Bs>��������Q=@�<U��?P2�4[�?
\ No newline at end of file
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/hswish_x_50.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/hswish_x_50.bin
new file mode 100644
index 0000000000000000000000000000000000000000..b9341ce21247a698c9ff168e38a43696581c3836
--- /dev/null
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/hswish_x_50.bin
@@ -0,0 +1 @@
+/��֐���5�>�"���\����=`ο�;�?�廿���?
\ No newline at end of file
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/hswish_yt_50.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/hswish_yt_50.bin
new file mode 100644
index 0000000000000000000000000000000000000000..b25a5c778789a510adef8e4bfaeab774aed1d472
Binary files /dev/null and b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/hswish_yt_50.bin differ
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/lrelu_out_50.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/lrelu_out_50.bin
new file mode 100644
index 0000000000000000000000000000000000000000..b13b02ddbc42052e6cec58ba089a97ea76390f24
Binary files /dev/null and b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/lrelu_out_50.bin differ
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/lrelu_y_50.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/lrelu_y_50.bin
new file mode 100644
index 0000000000000000000000000000000000000000..6031302671540be073958caf9e0510f85696f72f
--- /dev/null
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/lrelu_y_50.bin
@@ -0,0 +1 @@
+�����&�5Sa����?t�?��@W�,����2ս&�8?;V����?橡?$�?5���pN��F7�:��?�5V�:΄?�m��,!@���`|�>V���ؚ�?_�B?0Խ��"?��ｑ!>%�=��,?����>�Ѓ?�;?�q��Gh�?7��<����U>=��?�-�a�p?��g?���>�r@X�>
\ No newline at end of file
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/lrelu_yt_50.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/lrelu_yt_50.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7cd15ef7f91202ece4cd01401a6de73b64d1c769
Binary files /dev/null and b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/lrelu_yt_50.bin differ
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/relu6_out_50.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/relu6_out_50.bin
new file mode 100644
index 0000000000000000000000000000000000000000..0e9de34e838a98b8f169c1d10b4673b62786470d
Binary files /dev/null and b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/relu6_out_50.bin differ
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/relu6_y_50.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/relu6_y_50.bin
new file mode 100644
index 0000000000000000000000000000000000000000..095be3ca3a533381324322799563c12842504d3c
Binary files /dev/null and b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/relu6_y_50.bin differ
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/relu6_yt_50.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/relu6_yt_50.bin
new file mode 100644
index 0000000000000000000000000000000000000000..0e9de34e838a98b8f169c1d10b4673b62786470d
Binary files /dev/null and b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/relu6_yt_50.bin differ
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/relu_out_50.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/relu_out_50.bin
new file mode 100644
index 0000000000000000000000000000000000000000..dc7098eca0e8ef210e6b3f762d83b8a78c06fd7e
Binary files /dev/null and b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/relu_out_50.bin differ
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/relu_y_50.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/relu_y_50.bin
new file mode 100644
index 0000000000000000000000000000000000000000..00d6139a772c8f73137107da2fea797b47537793
Binary files /dev/null and b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/relu_y_50.bin differ
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/relu_yt_50.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/relu_yt_50.bin
new file mode 100644
index 0000000000000000000000000000000000000000..dc7098eca0e8ef210e6b3f762d83b8a78c06fd7e
Binary files /dev/null and b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/relu_yt_50.bin differ
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/sigmoid_out_50.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/sigmoid_out_50.bin
new file mode 100644
index 0000000000000000000000000000000000000000..8fcc885497be036600bfe56f29152b5efda22019
--- /dev/null
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/sigmoid_out_50.bin
@@ -0,0 +1 @@
+��]>E�>�Jn>bK>�8�=�<�P�>&��>�g>]��<WBX=S�t>�;�>�Q>I�>���=\�>��S>	ŀ=�C*>�K=�n>Iy�>���>�l�>/=�>rp>���>��>�(	>��[>-�>{�j=�4�>C��>�e�>D�>B�=��=x�>/m�>v�j>P�>�v�=Pʕ>��=�3>vN=� �>ӂ�>
\ No newline at end of file
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/sigmoid_y_50.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/sigmoid_y_50.bin
new file mode 100644
index 0000000000000000000000000000000000000000..9e3724e2c0d5bef22418627f4449bdbae32e824c
--- /dev/null
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/sigmoid_y_50.bin
@@ -0,0 +1 @@
+w�>���>XO�>?��>��h>��=��%?�o:?9��>"q�=��7>� �>??� �>9�?��{>�t?D2\?�J>n��>��1>���>�OF?/�?7y?J0?�eT?A?F$�>'��>Ab�>��#?"m@>�<?�8?�?�?��Z>�$i>�8?�*C?�)�>r�3?ᆒ>�X?9y>���>^�2>S�??w!'?
\ No newline at end of file
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/sigmoid_yt_50.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/sigmoid_yt_50.bin
new file mode 100644
index 0000000000000000000000000000000000000000..ca66395f8fc72c2f095a7bc2f61592226018a152
--- /dev/null
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/sigmoid_yt_50.bin
@@ -0,0 +1 @@
+wa?��"?XOo??�Q?���>�E>���?�o�?9�i?"qv>���>� u??�?� W?9��?���>�t�?D2�?��>n�8?���>��o?�O�?/Ǚ?7y�?J�?�e�?A�?F$&?'�?Ab_?���?"m�>˼?��?�?��?���>�$�>��?�*�?�)l?r�?�?�X�?9�>��??^�>SԿ?w!�?
\ No newline at end of file
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/tanh_out_50.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/tanh_out_50.bin
new file mode 100644
index 0000000000000000000000000000000000000000..9e3e18c43c380af8ca4cf7fbc62ca0f418af13f5
--- /dev/null
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/tanh_out_50.bin
@@ -0,0 +1,2 @@
+��@?^�*�Su>?���(1?Y�?O]�>8��>y�ͽ�h>Y�:��ן<e
+?@?C?�����C?6GU�p�>�_=�I��0`�>�0�>ݎ9?��;?Gs*>e3>��?�ʑ>��;?(�,?�&�3*����?��C?�C�w<2?���=�K�>%HC����%8?�M�>������~>'u�>J���I>^4Y�uZ?��
\ No newline at end of file
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/tanh_y_50.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/tanh_y_50.bin
new file mode 100644
index 0000000000000000000000000000000000000000..0a9a479053216ca28ee149d7a28170c1f82b3ecb
Binary files /dev/null and b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/tanh_y_50.bin differ
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/tanh_yt_50.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/tanh_yt_50.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7323533d2f7db7e605f0718712a06a961f732344
Binary files /dev/null and b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/tanh_yt_50.bin differ
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/conv/convfp32_dw_32_3_3_3.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/conv/convfp32_dw_32_3_3_3.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/conv/convfp32_dw_g3_18_3_3_3.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/conv/convfp32_dw_g3_18_3_3_3.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/conv/convfp32_dw_g3_d2_18_3_3_3.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/conv/convfp32_dw_g3_d2_18_3_3_3.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/conv/convfp32_dx_1_28_28_3.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/conv/convfp32_dx_1_28_28_3.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/conv/convfp32_dx_g3_1_28_28_3.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/conv/convfp32_dx_g3_1_28_28_3.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/conv/convfp32_dx_g3_d2_1_28_28_3.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/conv/convfp32_dx_g3_d2_1_28_28_3.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/conv/convfp32_dy_1_28_28_32.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/conv/convfp32_dy_1_28_28_32.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/conv/convfp32_dy_g3_1_28_28_18.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/conv/convfp32_dy_g3_1_28_28_18.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/conv/convfp32_dy_g3_d2_1_26_26_18.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/conv/convfp32_dy_g3_d2_1_26_26_18.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/conv/convfp32_w_32_3_3_3.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/conv/convfp32_w_32_3_3_3.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/conv/convfp32_w_g3_18_3_3_3.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/conv/convfp32_w_g3_18_3_3_3.bin
new file mode 100644
index 0000000000000000000000000000000000000000..1dc4bf74d844821f465898bd386b125b9be17e3d
--- /dev/null
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/conv/convfp32_w_g3_18_3_3_3.bin
@@ -0,0 +1,4 @@
+�F�.��N�2����?󻾩`?��ͽ��ؿS�į��2x��R=}�%��T�?9�>�R�?E�ÿ��?���>��?��@<�����*�F���s�?��h����>�۾i) ��W>+�;���=y�@\�?��V=~?�)о�Ϭ?HF}?�տ�ի?F꓿�E
+?�G���ÿ#μ�>P�D>���>J��?gN�Y,	<�ֈ���u?Y_�"�����4?��f����x��	Y�7�;¡̾���?�)�?���?��?�]@-�/�z�b?���Y���e?M���	/6?�"��?�t�?��T?;	-���1?,6?����.�>n��>8�D?�Ǿ
+�F��+j?~B?
+����P���?��?t ���ek?��I?W�J>��&?�� ?;�;�隿�j��=������sg�?����[k�?��r?ݖc>.����l��jy?�D�>S��¿��?l��?rS��q����?#m(@�_?>��l ���%6���?�<j���z>h%��k�>�=4������?�ŋ�o�J���������s>��fW�?8c;?��?�k���:�?bQ1���Y>yp�>�nW=�z�|S:?P��?�r�?�K�?���k���w��>��-��>�V�?�~>
\ No newline at end of file
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/conv/convfp32_w_g3_d2_18_3_3_3.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/conv/convfp32_w_g3_d2_18_3_3_3.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/conv/convfp32_x_1_28_28_3.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/conv/convfp32_x_1_28_28_3.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/conv/convfp32_x_g3_1_28_28_3.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/conv/convfp32_x_g3_1_28_28_3.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/conv/convfp32_x_g3_d2_1_28_28_3.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/conv/convfp32_x_g3_d2_1_28_28_3.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/conv/convfp32_y_g3_d2_1_26_26_18.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/conv/convfp32_y_g3_d2_1_26_26_18.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/matmul/matmulfp32_a_10x4.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/matmul/matmulfp32_a_10x4.bin
new file mode 100644
index 0000000000000000000000000000000000000000..9d152d6f6cdf16f555e2edf864c494d7ed06ed47
Binary files /dev/null and b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/matmul/matmulfp32_a_10x4.bin differ
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/matmul/matmulfp32_a_4x10.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/matmul/matmulfp32_a_4x10.bin
new file mode 100644
index 0000000000000000000000000000000000000000..2352a3989a1c367bfc0a260c1d1f871555a8da23
Binary files /dev/null and b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/matmul/matmulfp32_a_4x10.bin differ
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/matmul/matmulfp32_b_10x5.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/matmul/matmulfp32_b_10x5.bin
new file mode 100644
index 0000000000000000000000000000000000000000..a6df1ed831a045b7950ad2a3d0a285e242924003
Binary files /dev/null and b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/matmul/matmulfp32_b_10x5.bin differ
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/matmul/matmulfp32_b_5x10.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/matmul/matmulfp32_b_5x10.bin
new file mode 100644
index 0000000000000000000000000000000000000000..eb28b692f68f92bf059c718d5ef583973bbfddd0
Binary files /dev/null and b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/matmul/matmulfp32_b_5x10.bin differ
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/matmul/matmulfp32_c_4x5.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/matmul/matmulfp32_c_4x5.bin
new file mode 100644
index 0000000000000000000000000000000000000000..183a9f0ba57476bf28191ad70a7850a8793c4954
--- /dev/null
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/matmul/matmulfp32_c_4x5.bin
@@ -0,0 +1 @@
+.��@8�|��A-=,���fAQ�>2@du��i}?���t�4@���@2����zN@���ԣx@�&�(ӂ���e@�g��
\ No newline at end of file
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_10_dx1_5_4_6.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_10_dx1_5_4_6.bin
new file mode 100644
index 0000000000000000000000000000000000000000..fcb5df39266ce621bdb133949f02f82f5fb4d302
Binary files /dev/null and b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_10_dx1_5_4_6.bin differ
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_10_dx2_5_1_6.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_10_dx2_5_1_6.bin
new file mode 100644
index 0000000000000000000000000000000000000000..aa7bc323f733a3245bd16a390fcc620ec9dfb505
Binary files /dev/null and b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_10_dx2_5_1_6.bin differ
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_10_dy_5_4_6.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_10_dy_5_4_6.bin
new file mode 100644
index 0000000000000000000000000000000000000000..c68b9c80b0518116c96380bb4f6b2c6944399a73
--- /dev/null
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_10_dy_5_4_6.bin
@@ -0,0 +1 @@
+�&J�v���B�AoL�?�̓�I?����̿��A6�Խ�S��?���������d>��5��?��i���ӿ`�@�����u�@G��@`M>A���v>B)��>c����@$�/Aw�A�����˿�^ ��0�kܾȁ�Af���r�>0x˿�c��R�?�vu��=��`,>pŔ?aK����@������y?�׾�db�3�?�@ڤ�����eK?���Ч9�)��?u�u@��?�"�=�P�>b��>v�@�Nl@�Ð�U�>ot���?�*�@y	;��	@��Av���_���俶��\���q�?�w@�0�ݻ���j�?����Aq;�b�o���,��3@�I`?3sf�l�����@@I�?������?�AC_>=�L�����@���?	.����@xy`?��῿��A33��ˑ?�n�?.�����=�\B@/���A�>B_������K����<w�Wcz=m9=
\ No newline at end of file
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_1_dx1_4_6.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_1_dx1_4_6.bin
new file mode 100644
index 0000000000000000000000000000000000000000..37947edcb8da8f41f5860202b700368a9fe1f039
--- /dev/null
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_1_dx1_4_6.bin
@@ -0,0 +1 @@
+�wӿ��?8��>GF�?-�^��.;?����V]�?K�@z�:�} Q��P��q�H�� ſ���?��,@~�?LP�>7>�P����q@���P��>���@
\ No newline at end of file
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_1_dx2_1_6.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_1_dx2_1_6.bin
new file mode 100644
index 0000000000000000000000000000000000000000..1676dc671fba382cada419cff74f0b1bef99edb5
--- /dev/null
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_1_dx2_1_6.bin
@@ -0,0 +1 @@
+L&������h>)A�[7?�.��O2�@
\ No newline at end of file
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_1_dy_4_6.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_1_dy_4_6.bin
new file mode 100644
index 0000000000000000000000000000000000000000..37947edcb8da8f41f5860202b700368a9fe1f039
--- /dev/null
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_1_dy_4_6.bin
@@ -0,0 +1 @@
+�wӿ��?8��>GF�?-�^��.;?����V]�?K�@z�:�} Q��P��q�H�� ſ���?��,@~�?LP�>7>�P����q@���P��>���@
\ No newline at end of file
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_1_x1_4_6.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_1_x1_4_6.bin
new file mode 100644
index 0000000000000000000000000000000000000000..a093c882f6343999f7451710fd95ba7e4a736519
--- /dev/null
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_1_x1_4_6.bin
@@ -0,0 +1 @@
+����.%�?����.s?d���-f�=<V�nߔ?`Dz?�t���"��?���e|�>�~���޾�I�?��??�ƽ�Y?��3�B�?��X�f�<?�'@
\ No newline at end of file
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_1_x2_1_6.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_1_x2_1_6.bin
new file mode 100644
index 0000000000000000000000000000000000000000..668b5ae95ee07a730417fbfeb4b2dbedde5eda6a
--- /dev/null
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_1_x2_1_6.bin
@@ -0,0 +1 @@
+kB����t�?[�[�����T�>
\ No newline at end of file
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_2_dx1_4_6.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_2_dx1_4_6.bin
new file mode 100644
index 0000000000000000000000000000000000000000..6737f96700fa3514ac6f83dbc461eec7409b3881
Binary files /dev/null and b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_2_dx1_4_6.bin differ
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_2_dx2_1_6.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_2_dx2_1_6.bin
new file mode 100644
index 0000000000000000000000000000000000000000..94bd26e133b77350ba9194c3b3caa7e47642d1fa
--- /dev/null
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_2_dx2_1_6.bin
@@ -0,0 +1 @@
+���2�����)A%���dȑ���	�
\ No newline at end of file
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_2_dy_4_6.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_2_dy_4_6.bin
new file mode 100644
index 0000000000000000000000000000000000000000..6737f96700fa3514ac6f83dbc461eec7409b3881
Binary files /dev/null and b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_2_dy_4_6.bin differ
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_3_dx1_4_6.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_3_dx1_4_6.bin
new file mode 100644
index 0000000000000000000000000000000000000000..6737f96700fa3514ac6f83dbc461eec7409b3881
Binary files /dev/null and b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_3_dx1_4_6.bin differ
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_3_dx2_1_6.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_3_dx2_1_6.bin
new file mode 100644
index 0000000000000000000000000000000000000000..94bd26e133b77350ba9194c3b3caa7e47642d1fa
--- /dev/null
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_3_dx2_1_6.bin
@@ -0,0 +1 @@
+���2�����)A%���dȑ���	�
\ No newline at end of file
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_3_dy_4_6.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_3_dy_4_6.bin
new file mode 100644
index 0000000000000000000000000000000000000000..a4130257de45f33ca3b375ff3d8d178872776118
Binary files /dev/null and b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_3_dy_4_6.bin differ
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_4_dx1_4_6.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_4_dx1_4_6.bin
new file mode 100644
index 0000000000000000000000000000000000000000..b8cc5229db710bd7af730291cc606dab1e5ce7e7
--- /dev/null
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_4_dx1_4_6.bin
@@ -0,0 +1,2 @@
+;���p�
+?��y����=x�J�ɷi<%.v�e!?�N[@I��(*8�Iኽ}��>�	�$!ÿi>��u?J�W���y?R8���]�?�z��{�?�5�>
\ No newline at end of file
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_4_dx2_1_6.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_4_dx2_1_6.bin
new file mode 100644
index 0000000000000000000000000000000000000000..40d97286c83e81dc6c3b612d19e9acd9d0b7b837
--- /dev/null
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_4_dx2_1_6.bin
@@ -0,0 +1 @@
+b殿Q.*�(�@0���˸�o�k@
\ No newline at end of file
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_4_dy_4_6.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_4_dy_4_6.bin
new file mode 100644
index 0000000000000000000000000000000000000000..3cac9bfc200f002716402184b6b7586681b8917e
--- /dev/null
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_4_dy_4_6.bin
@@ -0,0 +1 @@
+ao�=�X��D�<�ԟо���?$_`=US�>]�����%@��?,�?�S��G+�b��>&e����+���ѿ#�N������v1>�b�?�<x]����?
\ No newline at end of file
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_5_dx1_4_6.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_5_dx1_4_6.bin
new file mode 100644
index 0000000000000000000000000000000000000000..1c4f556b5dea2f1bd2c4235facf277aef731234a
--- /dev/null
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_5_dx1_4_6.bin
@@ -0,0 +1 @@
+�Oq���A�����4%B���#�F@�$:�+�Av̎??�X��d����k���?�$�v����B�iAG 7���<@y_����$?�4�g�@��B
\ No newline at end of file
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_5_dx2_1_6.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_5_dx2_1_6.bin
new file mode 100644
index 0000000000000000000000000000000000000000..d141f7e46f74afecdfbedad92923aa204520a7c6
--- /dev/null
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_5_dx2_1_6.bin
@@ -0,0 +1 @@
+?�@3B46�E�xD<DB	sH�
\ No newline at end of file
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_5_dy_4_6.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_5_dy_4_6.bin
new file mode 100644
index 0000000000000000000000000000000000000000..26cee6843bbbfb8a0d6af24258df644d3568c9b0
--- /dev/null
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_5_dy_4_6.bin
@@ -0,0 +1,2 @@
+$�6>\���i׿׺�H|@�N?�&?�ǖ�b�?��9A{�d@��u�h?w��N�?�3(�cvi�5Ϙ��>���t
+6?ngZ?���>�� �/T�A
\ No newline at end of file
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_6_dx1_1_6.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_6_dx1_1_6.bin
new file mode 100644
index 0000000000000000000000000000000000000000..4fe396e1bc150ad956af7b8602e6757fd0a076d5
--- /dev/null
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_6_dx1_1_6.bin
@@ -0,0 +1 @@
+YA���:�ۢ�A93�*���B%�B
\ No newline at end of file
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_6_dx2_4_6.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_6_dx2_4_6.bin
new file mode 100644
index 0000000000000000000000000000000000000000..35a86f48b51240092c784af514c212c08a869719
--- /dev/null
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_6_dx2_4_6.bin
@@ -0,0 +1 @@
+gE|:��J"@��۽��>�'�����B����tp���B=�?C�?[^��9��Ano*B��ļ-h��V�C����D�B�����YD\�ۿ���
\ No newline at end of file
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_6_dy_4_6.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_6_dy_4_6.bin
new file mode 100644
index 0000000000000000000000000000000000000000..4bfe040d8ca8e26da89e09f67b3a5ac7b861036e
Binary files /dev/null and b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_6_dy_4_6.bin differ
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_7_dx1_4_5_6.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_7_dx1_4_5_6.bin
new file mode 100644
index 0000000000000000000000000000000000000000..32e73ebd1a59a71d7b9cafad77c73a29792021d2
Binary files /dev/null and b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_7_dx1_4_5_6.bin differ
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_7_dx2_1_1_6.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_7_dx2_1_1_6.bin
new file mode 100644
index 0000000000000000000000000000000000000000..06a8bb81819565622b30e10794bdf9d3a8ea00ec
--- /dev/null
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_7_dx2_1_1_6.bin
@@ -0,0 +1 @@
+M�A=@T�Y$�¦&+����f��
\ No newline at end of file
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_7_dy_4_5_6.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_7_dy_4_5_6.bin
new file mode 100644
index 0000000000000000000000000000000000000000..3a75ce88edd50937fedf5f93eaa74791ddc77b33
Binary files /dev/null and b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_7_dy_4_5_6.bin differ
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_7_x1_4_5_6.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_7_x1_4_5_6.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e5104b8e39e8c6fe018c5934c6ee8b21c874c58b
--- /dev/null
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_7_x1_4_5_6.bin
@@ -0,0 +1,2 @@
+5^�x�|>��?NW��ւ��pټ�R4�=p?����v֥?R�ݾQ��>�(v?p�j?��H?G�)>I޿�ʟ>��u�{`Խ��>�����Ԁ�!O?9�;>c�>?�\���7K?cb���_?.�k��u��S�=��j?1�=y�?p���)P?���=V�	?�.�>ј�?�H?*��w�M��H�������?��\�q��>(z࿣�����?��-���=�?y�?��ƿ�׻�+_�����?d���G�p	�X�?Q���t�>�����F��?N��=N�����
+�Yn`��%G?)�$?Jdi?7��"8��)���{�o����Ͼw����>�(L?Pԝ��'2?u�?2��?�"�>!פ?�?�?<�߾f�?�9F��&Q�EE}<�:G>�.�}�.�������S?\�̿K?*V���%?�Q����꿹uy�C7���:?�T4>`�>o>!?�>;���y�_�
\ No newline at end of file
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_7_x2_1_1_6.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_7_x2_1_1_6.bin
new file mode 100644
index 0000000000000000000000000000000000000000..b0921a2d80bc93374e13ac96cd3ab0ff2a778128
--- /dev/null
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_7_x2_1_1_6.bin
@@ -0,0 +1 @@
+������<>�o8?�#�?d�@ٌ<
\ No newline at end of file
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_8_dx1_5_4_6.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_8_dx1_5_4_6.bin
new file mode 100644
index 0000000000000000000000000000000000000000..199d5821733aa9fa063141ad08b318b442d5a522
Binary files /dev/null and b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_8_dx1_5_4_6.bin differ
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_8_dx2_5_1_6.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_8_dx2_5_1_6.bin
new file mode 100644
index 0000000000000000000000000000000000000000..ea5dbc6a93c0f4d62a7f5edfbbe453ce12661a1b
Binary files /dev/null and b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_8_dx2_5_1_6.bin differ
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_8_dy_5_4_6.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_8_dy_5_4_6.bin
new file mode 100644
index 0000000000000000000000000000000000000000..199d5821733aa9fa063141ad08b318b442d5a522
Binary files /dev/null and b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_8_dy_5_4_6.bin differ
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_8_x1_5_4_6.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_8_x1_5_4_6.bin
new file mode 100644
index 0000000000000000000000000000000000000000..d14440793e805aa258e892763ba286414278fb17
Binary files /dev/null and b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_8_x1_5_4_6.bin differ
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_8_x2_5_1_6.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_8_x2_5_1_6.bin
new file mode 100644
index 0000000000000000000000000000000000000000..9b6169666c7e2ea3861264db2b9ae128d87e02de
Binary files /dev/null and b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_8_x2_5_1_6.bin differ
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_9_dx1_5_4_6.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_9_dx1_5_4_6.bin
new file mode 100644
index 0000000000000000000000000000000000000000..5824f414cfa339ade5c4b7008f205129063a623f
Binary files /dev/null and b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_9_dx1_5_4_6.bin differ
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_9_dx2_5_1_6.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_9_dx2_5_1_6.bin
new file mode 100644
index 0000000000000000000000000000000000000000..bee4fa98bc60baf50da59ac8847941b548c72321
--- /dev/null
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_9_dx2_5_1_6.bin
@@ -0,0 +1,2 @@
+��@Up����Q@(��A�,@N5�����@����(q�?O)X@!g�@��}@+��?�?B��>+��?�M�@��,@�@��eA��O�F��@�ۉ����@%�A������N��*�
+��+|�
\ No newline at end of file
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_9_dy_5_4_6.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_9_dy_5_4_6.bin
new file mode 100644
index 0000000000000000000000000000000000000000..dde1dd8cbc6da1c723ef49e774c4e3a368f33ea4
Binary files /dev/null and b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_9_dy_5_4_6.bin differ
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_9_x1_5_4_6.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_9_x1_5_4_6.bin
new file mode 100644
index 0000000000000000000000000000000000000000..d14440793e805aa258e892763ba286414278fb17
Binary files /dev/null and b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_9_x1_5_4_6.bin differ
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_9_x2_5_1_6.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_9_x2_5_1_6.bin
new file mode 100644
index 0000000000000000000000000000000000000000..9b6169666c7e2ea3861264db2b9ae128d87e02de
Binary files /dev/null and b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_9_x2_5_1_6.bin differ
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/biasgradfp32_1_db_7.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/biasgradfp32_1_db_7.bin
new file mode 100644
index 0000000000000000000000000000000000000000..46853507f8e63a02e44b66514730cffc2c388d75
Binary files /dev/null and b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/biasgradfp32_1_db_7.bin differ
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/biasgradfp32_1_dy_10_28_28_7.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/biasgradfp32_1_dy_10_28_28_7.bin
new file mode 100644
index 0000000000000000000000000000000000000000..c079b77a367c198c8b6da19257eb43ca89f5d605
Binary files /dev/null and b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/biasgradfp32_1_dy_10_28_28_7.bin differ
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/sce_fp32_1_dy_6_4.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/sce_fp32_1_dy_6_4.bin
new file mode 100644
index 0000000000000000000000000000000000000000..d135477618dbadaf3294fbf815d26b51831aa577
--- /dev/null
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/sce_fp32_1_dy_6_4.bin
@@ -0,0 +1,2 @@
+�,Q=���dt=*��<���ʮ�<��'=䜶=�_�<��ὥ
+=��i=�ӭ=�a�:�<��=AL<�?!�"�=�U�<V��=��<���DΕ;
\ No newline at end of file
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/sce_fp32_1_l_6.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/sce_fp32_1_l_6.bin
new file mode 100644
index 0000000000000000000000000000000000000000..73d772e3b09ef52111fde7193e0d7f530e96ec2d
Binary files /dev/null and b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/sce_fp32_1_l_6.bin differ
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/sce_fp32_1_loss_1.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/sce_fp32_1_loss_1.bin
new file mode 100644
index 0000000000000000000000000000000000000000..80a2fb094e336e263f02c77bc784c3288689cd5f
--- /dev/null
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/sce_fp32_1_loss_1.bin
@@ -0,0 +1 @@
+�?
\ No newline at end of file
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/sce_fp32_1_y_6_4.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/sce_fp32_1_y_6_4.bin
new file mode 100644
index 0000000000000000000000000000000000000000..b3bbd23ef496f487075c0a282fd26a44f193422c
--- /dev/null
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/sce_fp32_1_y_6_4.bin
@@ -0,0 +1 @@
+�"��愿ę���/���^>#��wDh?>��?���>�>a���>S��?���>��>*D^?�����[���?�Q�=��?56�>ȭ?����
\ No newline at end of file
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/pooling/avgpoolgradfp32_1_dx_1_28_28_3.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/pooling/avgpoolgradfp32_1_dx_1_28_28_3.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e5d086ef061316adad5650c336a9ff7ce158c8e3
Binary files /dev/null and b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/pooling/avgpoolgradfp32_1_dx_1_28_28_3.bin differ
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/pooling/avgpoolgradfp32_1_dy_1_28_28_3.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/pooling/avgpoolgradfp32_1_dy_1_28_28_3.bin
new file mode 100644
index 0000000000000000000000000000000000000000..ad87262c97b758e251a9ad648ec5311cc6bfe0e6
Binary files /dev/null and b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/pooling/avgpoolgradfp32_1_dy_1_28_28_3.bin differ
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/pooling/avgpoolgradfp32_1_x_1_28_28_3.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/pooling/avgpoolgradfp32_1_x_1_28_28_3.bin
new file mode 100644
index 0000000000000000000000000000000000000000..d32ffbe755462f66fdb69c3585ca43e787be2296
Binary files /dev/null and b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/pooling/avgpoolgradfp32_1_x_1_28_28_3.bin differ
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/pooling/maxpoolgradfp32_1_dx_1_28_28_3.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/pooling/maxpoolgradfp32_1_dx_1_28_28_3.bin
new file mode 100644
index 0000000000000000000000000000000000000000..cca67a85df1163f95027823b6502328f323f4555
Binary files /dev/null and b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/pooling/maxpoolgradfp32_1_dx_1_28_28_3.bin differ
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/pooling/maxpoolgradfp32_1_dy_1_28_28_3.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/pooling/maxpoolgradfp32_1_dy_1_28_28_3.bin
new file mode 100644
index 0000000000000000000000000000000000000000..15c810365e291c8ced141d0cfec88668882d99dc
Binary files /dev/null and b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/pooling/maxpoolgradfp32_1_dy_1_28_28_3.bin differ
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/pooling/maxpoolgradfp32_1_i_1_28_28_3.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/pooling/maxpoolgradfp32_1_i_1_28_28_3.bin
new file mode 100644
index 0000000000000000000000000000000000000000..c50b6145ab474febecb071cbc1237a8a6babb4d0
Binary files /dev/null and b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/pooling/maxpoolgradfp32_1_i_1_28_28_3.bin differ
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/pooling/maxpoolgradfp32_2_dx_1_30_30_3.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/pooling/maxpoolgradfp32_2_dx_1_30_30_3.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e2ee3307f9f464025eea0a473e7921a1c0ce282c
Binary files /dev/null and b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/pooling/maxpoolgradfp32_2_dx_1_30_30_3.bin differ
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/pooling/maxpoolgradfp32_2_dy_1_10_10_3.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/pooling/maxpoolgradfp32_2_dy_1_10_10_3.bin
new file mode 100644
index 0000000000000000000000000000000000000000..0985d76b035273333eb1f6f69b1276a3f4aecda6
Binary files /dev/null and b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/pooling/maxpoolgradfp32_2_dy_1_10_10_3.bin differ
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/pooling/maxpoolgradfp32_2_dy_1_3_10_10.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/pooling/maxpoolgradfp32_2_dy_1_3_10_10.bin
new file mode 100644
index 0000000000000000000000000000000000000000..c95d4ce0fa1a53c68a3f38a00505bad5ec7b4ea7
Binary files /dev/null and b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/pooling/maxpoolgradfp32_2_dy_1_3_10_10.bin differ
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/pooling/maxpoolgradfp32_2_i_1_10_10_3.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/pooling/maxpoolgradfp32_2_i_1_10_10_3.bin
new file mode 100644
index 0000000000000000000000000000000000000000..d5647f32dc7d45efa3283cbf26ec277dd030e70b
Binary files /dev/null and b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/pooling/maxpoolgradfp32_2_i_1_10_10_3.bin differ
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/pooling/maxpoolgradfp32_2_i_1_3_10_10.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/pooling/maxpoolgradfp32_2_i_1_3_10_10.bin
new file mode 100644
index 0000000000000000000000000000000000000000..89157fd9dd7289a92a7f1654e256139966c8c6e5
Binary files /dev/null and b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/pooling/maxpoolgradfp32_2_i_1_3_10_10.bin differ
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/pooling/maxpoolgradfp32_2_x_1_30_30_3.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/pooling/maxpoolgradfp32_2_x_1_30_30_3.bin
new file mode 100644
index 0000000000000000000000000000000000000000..61cfb07fdb62835ef471cc0b277cb50fae2ee0e7
Binary files /dev/null and b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/pooling/maxpoolgradfp32_2_x_1_30_30_3.bin differ
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/power/powerfp32_dx_scale5_shift2_power3.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/power/powerfp32_dx_scale5_shift2_power3.bin
new file mode 100644
index 0000000000000000000000000000000000000000..0bb3ced76525b2182d5b247ddf40af2849bf760c
--- /dev/null
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/power/powerfp32_dx_scale5_shift2_power3.bin
@@ -0,0 +1,2 @@
+N�G�^a��FN.�c�J5G
+���WExe�ء��?wD�RG��TEA�C_��>Y�HÜ*I��IM�.��IEB���:g���$I�H�p�ñBF��C��B�tHj�C�{@FIW>�t��F�ӦJE1�Fx3E~��#)JI��E4'nI��B��
\ No newline at end of file
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/power/powerfp32_dy_scale5_shift2_power3.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/power/powerfp32_dy_scale5_shift2_power3.bin
new file mode 100644
index 0000000000000000000000000000000000000000..b1e404b742da5cd3ef913243701371920176fe4d
--- /dev/null
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/power/powerfp32_dy_scale5_shift2_power3.bin
@@ -0,0 +1 @@
+��N�#��LS�R�Di]���	Bo;E�g9��WN�A{�C�B��@�I�4~D>�PD �D��S��pBH � ���VNLD��Cm�����B*lA֓\@i�C�A���B��^��t�B�+3E�CO�A�s���`�D3DeB��~D`�v@��:�
\ No newline at end of file
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/power/powerfp32_x_scale5_shift2_power3.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/power/powerfp32_x_scale5_shift2_power3.bin
new file mode 100644
index 0000000000000000000000000000000000000000..d902be1588fdff4fb65acbfdbd2fbceb8542ddfa
--- /dev/null
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/power/powerfp32_x_scale5_shift2_power3.bin
@@ -0,0 +1 @@
+��h�)jr�_O��%�?�"��=�������|;9$�>���=�D��w���9u?��?PҴ?~q�<��=���BͿ9Ċ?�,?��6����>8���$�du5?g����b�>")H�`��>q�?I��>.��=,�%��?$!Y>�H�?|�����
\ No newline at end of file
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/train/train_bias_10.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/train/train_bias_10.bin
new file mode 100644
index 0000000000000000000000000000000000000000..527f9f7399d5a7b40d66f4101cb4189d4ad97a7f
--- /dev/null
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/train/train_bias_10.bin
@@ -0,0 +1 @@
+���>SY��@+[K�c�߾&��O�?-�?��Q��jt?
\ No newline at end of file
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/train/train_input_32_1000.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/train/train_input_32_1000.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/train/train_weight_10_1000.bin b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/train/train_weight_10_1000.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391