!3936 [lite]compare,batch_to_space,depth_to_space,argmin,argmax support int8

Merge pull request !3936 from chenjianping/lite_dev

!3936 [lite]compare,batch_to_space,depth_to_space,argmin,argmax support int8
Merge pull request !3936 from chenjianping/lite_dev
00fa08ba · mindspore-ci-bot · Gitee · 4b2e26f1 · 32960a58 · 00fa08ba
26 changed files
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic.h
@@ -90,6 +90,7 @@ class ArithmeticCPUKernel : public LiteKernel {
      case PrimitiveType_FloorMod:
        arithmetic_run_ = ElementFloorMod;
        arithmetic_broadcast_run_ = BroadcastFloorMod;
+        break;
      case PrimitiveType_Equal:
        arithmetic_run_ = ElementEqual;
        arithmetic_broadcast_run_ = BroadcastEqual;

--- a/mindspore/lite/src/runtime/kernel/arm/int8/argminmax_int8.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/argminmax_int8.cc
@@ -17,7 +17,7 @@
 #include <vector>
 #include "schema/model_generated.h"
 #include "src/kernel_registry.h"
-#include "src/runtime/kernel/arm/opclib/arg_min_max.h"
+#include "src/runtime/kernel/arm/opclib/int8/arg_min_max_int8.h"
 #include "include/errorcode.h"

 using mindspore::lite::RET_OK;
@@ -31,12 +31,45 @@ int ArgMinMaxInt8CPUKernel::Init() {
  }
  auto param = reinterpret_cast<ArgMinMaxParameter *>(opParameter);
  param->data_type_ = kNumberTypeInt8;
+  auto *input_tensor = inputs_.at(kInputIndex);
+  auto in_quant_args = input_tensor->GetQuantParams();
+  in_quant_arg_.scale_ = in_quant_args.front().scale;
+  in_quant_arg_.zp_ = in_quant_args.front().zeroPoint;
+
+  auto *out_tensor = outputs_.at(kOutputIndex);
+  auto out_quant_args = out_tensor->GetQuantParams();
+  out_quant_arg_.scale_ = out_quant_args.front().scale;
+  out_quant_arg_.zp_ = out_quant_args.front().zeroPoint;
  return RET_OK;
 }

 int ArgMinMaxInt8CPUKernel::Run() {
-  auto ret = ArgMinMaxBaseCPUKernel::Run();
-  FreeTmpMemory();
-  return ret;
+  // Runs the quantized arg-min/arg-max: the top-1 case goes through ArgMinMaxQuant, every other
+  // top-k value is dispatched to the per-axis routine selected by param->axis_.
+  // NOTE(review): the previous implementation called FreeTmpMemory() after running; this one
+  // does not - confirm the temporary buffers are released elsewhere.
+  auto input = inputs_.at(0);
+
+  const int8_t *input_data = reinterpret_cast<const int8_t *>(input->Data());
+  int8_t *output_data = reinterpret_cast<int8_t *>(outputs_.at(0)->Data());
+
+  // Hold the shape in a named local so the pointer handed to the opclib routines stays valid for
+  // the whole call, whether shape() returns a reference or a temporary.
+  auto in_shape = input->shape();
+  auto param = reinterpret_cast<ArgMinMaxParameter *>(opParameter);
+  if (param->topk_ == 1) {
+    ArgMinMaxQuant(input_data, output_data, in_shape.data(), param, &in_quant_arg_, &out_quant_arg_);
+    return RET_OK;
+  }
+
+  switch (param->axis_) {
+  case 0:
+    ArgMinMaxDim0(input_data, output_data, in_shape.data(), param, &in_quant_arg_, &out_quant_arg_);
+    break;
+  case 1:
+    ArgMinMaxDim1(input_data, output_data, in_shape.data(), param, &in_quant_arg_, &out_quant_arg_);
+    break;
+  case 2:
+    ArgMinMaxDim2(input_data, output_data, in_shape.data(), param, &in_quant_arg_, &out_quant_arg_);
+    break;
+  case 3:
+    ArgMinMaxDim3(input_data, output_data, in_shape.data(), param, &in_quant_arg_, &out_quant_arg_);
+    break;
+  default:
+    // Without this branch an unsupported axis would report success with the output never written.
+    MS_LOG(ERROR) << "Unsupported axis " << param->axis_ << " for int8 ArgMinMax with topk > 1";
+    return lite::RET_ERROR;
+  }
+  return RET_OK;
 }
 }  // namespace mindspore::kernel
--- a/mindspore/lite/src/runtime/kernel/arm/int8/argminmax_int8.h
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/argminmax_int8.h
@@ -18,6 +18,7 @@

 #include <vector>
 #include "src/runtime/kernel/arm/base/arg_min_max_base.h"
+#include "src/runtime/kernel/arm/opclib/quantization/quantize.h"

 namespace mindspore::kernel {
 class ArgMinMaxInt8CPUKernel : public ArgMinMaxBaseCPUKernel {
@@ -31,6 +32,9 @@ class ArgMinMaxInt8CPUKernel : public ArgMinMaxBaseCPUKernel {
  int Init() override;
  int ReSize() override { return 0; }
  int Run() override;
+ private:
+  QuantArg in_quant_arg_;
+  QuantArg out_quant_arg_;
 };
 }  // namespace mindspore::kernel


--- a/mindspore/lite/src/runtime/kernel/arm/int8/arithmetic_int8.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/arithmetic_int8.cc
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/runtime/kernel/arm/int8/arithmetic_int8.h"
+#include "src/runtime/kernel/arm/opclib/int8/arithmetic_int8.h"
+#include "src/runtime/kernel/arm/opclib/arithmetic_common.h"
+#include "schema/model_generated.h"
+#include "src/kernel_registry.h"
+#include "src/runtime/runtime_api.h"
+#include "include/errorcode.h"
+
+using mindspore::kernel::KERNEL_ARCH::kCPU;
+using mindspore::lite::KernelRegistrar;
+using mindspore::lite::RET_PARAM_INVALID;
+using mindspore::lite::RET_ERROR;
+using mindspore::lite::RET_OK;
+
+using mindspore::schema::PrimitiveType_Equal;
+using mindspore::schema::PrimitiveType_NotEqual;
+using mindspore::schema::PrimitiveType_LessEqual;
+using mindspore::schema::PrimitiveType_Greater;
+using mindspore::schema::PrimitiveType_GreaterEqual;
+using mindspore::schema::PrimitiveType_Less;
+
+namespace mindspore::kernel {
+namespace {
+int ArithmeticsInt8Launch(int thread_id, LiteParallelGroupEnv *penv, void *cdata) {
+  // Parallel-launch trampoline: cdata carries the kernel instance, thread_id selects the task.
+  auto *kernel = reinterpret_cast<ArithmeticInt8CPUKernel *>(cdata);
+  const auto status = kernel->DoArithmetic(thread_id);
+  if (status == RET_OK) {
+    return RET_OK;
+  }
+  MS_LOG(ERROR) << "ArithmeticsRun error thread_id[" << thread_id << "] error_code[" << status << "]";
+  return RET_ERROR;
+}
+}  // namespace
+
+ArithmeticInt8CPUKernel::~ArithmeticInt8CPUKernel() {
+  // The tile buffers are only released when the op was configured for broadcasting.
+  if (!reinterpret_cast<ArithmeticParameter *>(opParameter)->broadcasting_) {
+    return;
+  }
+  // Release through the context allocator when one is set, otherwise through free(),
+  // mirroring the choice made in Init().
+  const bool use_allocator = context_->allocator != nullptr;
+  int8_t *tiles[] = {tile_data0_, tile_data1_};
+  for (int8_t *tile : tiles) {
+    if (tile == nullptr) {
+      continue;
+    }
+    if (use_allocator) {
+      context_->allocator->Free(tile);
+    } else {
+      free(tile);
+    }
+  }
+  tile_data0_ = nullptr;
+  tile_data1_ = nullptr;
+}
+
+// Selects the element-wise comparison routine for this primitive and, when the op broadcasts,
+// allocates the two tile buffers (each the size of the output tensor).
+// Returns RET_OK on success, RET_PARAM_INVALID for an unsupported primitive type and RET_ERROR
+// when a tile buffer cannot be allocated.
+int ArithmeticInt8CPUKernel::Init() {
+  // Start from a known state so the destructor never sees indeterminate pointers, even when
+  // this function returns early.
+  arithmetic_run_ = nullptr;
+  tile_data0_ = nullptr;
+  tile_data1_ = nullptr;
+
+  switch (opParameter->type_) {
+    case PrimitiveType_Equal:
+      arithmetic_run_ = ElementEqual;
+      break;
+    case PrimitiveType_NotEqual:
+      arithmetic_run_ = ElementNotEqual;
+      break;
+    case PrimitiveType_Less:
+      // Was wired to ElementEqual, which made Less behave like Equal.
+      arithmetic_run_ = ElementLess;
+      break;
+    case PrimitiveType_LessEqual:
+      // Was wired to ElementNotEqual, which made LessEqual behave like NotEqual.
+      arithmetic_run_ = ElementLessEqual;
+      break;
+    case PrimitiveType_Greater:
+      arithmetic_run_ = ElementGreater;
+      break;
+    case PrimitiveType_GreaterEqual:
+      arithmetic_run_ = ElementGreaterEqual;
+      break;
+    default:
+      MS_LOG(ERROR) << "Error Operator type " << opParameter->type_;
+      return RET_PARAM_INVALID;
+  }
+
+  auto param = reinterpret_cast<ArithmeticParameter *>(opParameter);
+  if (!param->broadcasting_) {
+    // No tiling needed; the tile pointers stay null.
+    return RET_OK;
+  }
+
+  auto data_size = outputs_[0]->Size();
+  if (context_->allocator != nullptr) {
+    tile_data0_ = reinterpret_cast<int8_t *>(context_->allocator->Malloc(data_size));
+    tile_data1_ = reinterpret_cast<int8_t *>(context_->allocator->Malloc(data_size));
+  } else {
+    tile_data0_ = reinterpret_cast<int8_t *>(malloc(data_size));
+    tile_data1_ = reinterpret_cast<int8_t *>(malloc(data_size));
+  }
+  if (tile_data0_ == nullptr || tile_data1_ == nullptr) {
+    // A buffer that was obtained is released by the destructor.
+    MS_LOG(ERROR) << "Malloc tile buffers for broadcasting failed, size: " << data_size;
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
+// Nothing is recomputed on resize; always reports success.
+int ArithmeticInt8CPUKernel::ReSize() {
+  return RET_OK;
+}
+
+// Computes this task's slice of the element-wise comparison.
+// The output is split into thread_count_ contiguous slices; task thread_id handles slice number
+// thread_id. When broadcasting, the operands are the pre-tiled buffers, otherwise the raw inputs.
+// Returns RET_OK on success (including an empty slice) and RET_ERROR otherwise.
+int ArithmeticInt8CPUKernel::DoArithmetic(int thread_id) {
+  if (arithmetic_run_ == nullptr) {
+    MS_LOG(ERROR) << "arithmetic_run function is nullptr!";
+    return RET_ERROR;
+  }
+  if (thread_count_ <= 0) {
+    // Guards the division below in builds where assertions are compiled out.
+    MS_LOG(ERROR) << "Invalid thread count: " << thread_count_;
+    return RET_ERROR;
+  }
+
+  auto param = reinterpret_cast<ArithmeticParameter *>(opParameter);
+  auto output_data = reinterpret_cast<int8_t *>(outputs_[0]->Data());
+  const int element_num = static_cast<int>(outputs_[0]->ElementsNum());
+
+  int8_t *lhs = nullptr;
+  int8_t *rhs = nullptr;
+  if (param->broadcasting_) {
+    lhs = tile_data0_;
+    rhs = tile_data1_;
+  } else {
+    lhs = reinterpret_cast<int8_t *>(inputs_[0]->Data());
+    rhs = reinterpret_cast<int8_t *>(inputs_[1]->Data());
+  }
+  if (lhs == nullptr || rhs == nullptr || output_data == nullptr) {
+    MS_LOG(ERROR) << "Arithmetic data buffer is nullptr!";
+    return RET_ERROR;
+  }
+
+  // Slice both paths the same way. Previously the non-broadcast path had every task recompute
+  // and rewrite the entire output concurrently.
+  int stride = UP_DIV(element_num, thread_count_);
+  int offset = stride * thread_id;
+  int count = MSMIN(stride, element_num - offset);
+  if (count <= 0) {
+    // More tasks than elements: nothing left for this task.
+    return RET_OK;
+  }
+
+  int error_code = arithmetic_run_(lhs + offset, rhs + offset, output_data + offset, count);
+  if (error_code != RET_OK) {
+    MS_LOG(ERROR) << "Arithmetic run fail! ret: " << error_code;
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
+int ArithmeticInt8CPUKernel::Run() {
+  auto *arith_param = reinterpret_cast<ArithmeticParameter *>(opParameter);
+  if (arith_param->broadcasting_) {
+    // Expand both operands into the tile buffers before the parallel element-wise pass.
+    auto *lhs_data = reinterpret_cast<int8_t *>(inputs_[0]->Data());
+    auto *rhs_data = reinterpret_cast<int8_t *>(inputs_[1]->Data());
+    TileDimensionsInt8(lhs_data, rhs_data, tile_data0_, tile_data1_, arith_param);
+  }
+  const int launch_status = LiteBackendParallelLaunch(ArithmeticsInt8Launch, this, thread_count_);
+  if (launch_status == RET_OK) {
+    return RET_OK;
+  }
+  MS_LOG(ERROR) << "Arithmetic launch function fail! ret: " << launch_status;
+  return RET_ERROR;
+}
+
+// Factory for the int8 comparison kernels registered below.
+// Returns a fully initialised kernel, or nullptr when an argument is null, allocation fails or
+// Init() reports an error (the partially built kernel is deleted in that case).
+kernel::LiteKernel *CpuArithmeticInt8KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
+                                                   const std::vector<lite::tensor::Tensor *> &outputs,
+                                                   OpParameter *parameter, const lite::Context *ctx,
+                                                   const kernel::KernelKey &desc) {
+  if (parameter == nullptr) {
+    MS_LOG(ERROR) << "Input parameter is null!";
+    return nullptr;
+  }
+  if (ctx == nullptr) {
+    // The kernel constructor reads ctx->thread_num_, so a null context must be rejected here.
+    MS_LOG(ERROR) << "Input context is null!";
+    return nullptr;
+  }
+  auto kernel = new (std::nothrow) ArithmeticInt8CPUKernel(parameter, inputs, outputs, ctx);
+  if (kernel == nullptr) {
+    MS_LOG(ERROR) << "Create ArithmeticInt8CPUKernel failed, name: " << parameter->name_;
+    return nullptr;
+  }
+  auto ret = kernel->Init();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Init kernel failed, name: " << parameter->name_ << ", type: "
+                  << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(parameter->type_));
+    delete kernel;
+    return nullptr;
+  }
+  return kernel;
+}
+
+// Register the same creator for each of the six int8 comparison primitives on the CPU backend.
+REG_KERNEL(kCPU, kNumberTypeInt8, PrimitiveType_Equal, CpuArithmeticInt8KernelCreator)
+REG_KERNEL(kCPU, kNumberTypeInt8, PrimitiveType_NotEqual, CpuArithmeticInt8KernelCreator)
+REG_KERNEL(kCPU, kNumberTypeInt8, PrimitiveType_Less, CpuArithmeticInt8KernelCreator)
+REG_KERNEL(kCPU, kNumberTypeInt8, PrimitiveType_LessEqual, CpuArithmeticInt8KernelCreator)
+REG_KERNEL(kCPU, kNumberTypeInt8, PrimitiveType_Greater, CpuArithmeticInt8KernelCreator)
+REG_KERNEL(kCPU, kNumberTypeInt8, PrimitiveType_GreaterEqual, CpuArithmeticInt8KernelCreator)
+
+}  // namespace mindspore::kernel
--- a/mindspore/lite/src/runtime/kernel/arm/int8/arithmetic_int8.h
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/arithmetic_int8.h
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_ARITHMETIC_INT8_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_ARITHMETIC_INT8_H_
+
+#include <vector>
+#include "src/lite_kernel.h"
+#include "schema/model_generated.h"
+
+namespace mindspore::kernel {
+// CPU kernel for int8 element-wise comparison ops. Init() selects the routine and allocates the
+// broadcast tile buffers, Run() launches DoArithmetic() across thread_count_ tasks.
+class ArithmeticInt8CPUKernel : public LiteKernel {
+  // Signature of the element-wise routines this kernel dispatches to.
+  typedef int (*ArithmeticRunInt8)(int8_t *input0, int8_t *input1, int8_t *output, int element_size);
+
+ public:
+  // ctx must be non-null: its thread_num_ is read here and the pointer is kept for later use.
+  ArithmeticInt8CPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
+                          const std::vector<lite::tensor::Tensor *> &outputs, const lite::Context *ctx)
+      : LiteKernel(parameter, inputs, outputs), thread_count_(ctx->thread_num_), context_(ctx) {}
+  ~ArithmeticInt8CPUKernel();
+
+  int Init() override;
+  int ReSize() override;
+  int Run() override;
+  // Per-task body invoked by the parallel launcher.
+  int DoArithmetic(int thread_id);
+
+ private:
+  int thread_count_;
+  // Tile buffers used when broadcasting. Default to null so the destructor is safe even when
+  // Init() was never called or returned early.
+  int8_t *tile_data0_ = nullptr;
+  int8_t *tile_data1_ = nullptr;
+  const lite::Context *context_;
+  // Routine chosen by Init(); null until then.
+  ArithmeticRunInt8 arithmetic_run_ = nullptr;
+};
+}  // namespace mindspore::kernel
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_ARITHMETIC_INT8_H_
--- a/mindspore/lite/src/runtime/kernel/arm/int8/batch_to_space_int8.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/batch_to_space_int8.cc
@@ -18,13 +18,27 @@
 #include "schema/model_generated.h"
 #include "src/kernel_registry.h"
 #include "src/runtime/kernel/arm/opclib/batch_to_space.h"
+#include "src/runtime/kernel/arm/opclib/int8/batch_to_space_int8.h"
 #include "include/errorcode.h"

 using mindspore::lite::RET_OK;

 namespace mindspore::kernel {
 int BatchToSpaceInt8CPUKernel::Init() {
-  return BatchToSpaceBaseCPUKernel::Init();
+  // Runs the base-class initialisation, then caches the first quantization parameter set
+  // (scale and zero point) of the input and output tensors for use in Run().
+  auto ret = BatchToSpaceBaseCPUKernel::Init();
+  if (ret != RET_OK) {
+    return ret;
+  }
+  auto *input_tensor = inputs_.at(kInputIndex);
+  auto in_quant_args = input_tensor->GetQuantParams();
+  auto *out_tensor = outputs_.at(kOutputIndex);
+  auto out_quant_args = out_tensor->GetQuantParams();
+  // front() on an empty container is undefined, so reject tensors without quant params.
+  if (in_quant_args.empty() || out_quant_args.empty()) {
+    MS_LOG(ERROR) << "BatchToSpace int8 kernel requires quant params on input and output tensors";
+    return lite::RET_ERROR;
+  }
+  in_quant_arg_.scale_ = in_quant_args.front().scale;
+  in_quant_arg_.zp_ = in_quant_args.front().zeroPoint;
+
+  out_quant_arg_.scale_ = out_quant_args.front().scale;
+  out_quant_arg_.zp_ = out_quant_args.front().zeroPoint;
+  return RET_OK;
 }

 int BatchToSpaceInt8CPUKernel::Run() {
@@ -36,12 +50,22 @@ int BatchToSpaceInt8CPUKernel::Run() {
  auto out_shape = output->shape();
  BatchToSpaceParameter *param = reinterpret_cast<BatchToSpaceParameter *>(this->opParameter);

-  if (IsNoCrop()) {
-    BatchToSpaceNoCropForNHWC(input_data, output_data, in_shape.data(), out_shape[0], param->block_shape_,
-                              sizeof(int8_t));
+  if (in_quant_arg_.scale_ == out_quant_arg_.scale_ && in_quant_arg_.zp_ == out_quant_arg_.zp_) {
+    if (IsNoCrop()) {
+      BatchToSpaceNoCropForNHWC(input_data, output_data, in_shape.data(), out_shape[0], param->block_shape_,
+                                sizeof(int8_t));
+    } else {
+      BatchToSpaceForNHWC(input_data, output_data, in_shape.data(), out_shape[0], param->block_shape_, param->crops_,
+                          sizeof(int8_t));
+    }
  } else {
-    BatchToSpaceForNHWC(input_data, output_data, in_shape.data(), out_shape[0], param->block_shape_, param->crops_,
-                        sizeof(int8_t));
+    if (IsNoCrop()) {
+      BatchToSpaceNoCropForNHWC(input_data, output_data, in_shape.data(), out_shape[0], param->block_shape_,
+                                &in_quant_arg_, &out_quant_arg_);
+    } else {
+      BatchToSpaceForNHWC(input_data, output_data, in_shape.data(), out_shape[0], param->block_shape_, param->crops_,
+                          &in_quant_arg_, &out_quant_arg_);
+    }
  }

  return RET_OK;

--- a/mindspore/lite/src/runtime/kernel/arm/int8/batch_to_space_int8.h
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/batch_to_space_int8.h
@@ -31,6 +31,9 @@ class BatchToSpaceInt8CPUKernel : public BatchToSpaceBaseCPUKernel {
  int Init() override;
  int ReSize() override { return 0; }
  int Run() override;
+ private:
+  QuantArg in_quant_arg_;
+  QuantArg out_quant_arg_;
 };
 }  // namespace mindspore::kernel


--- a/mindspore/lite/src/runtime/kernel/arm/int8/depth_to_space_int8.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/depth_to_space_int8.cc
@@ -18,6 +18,7 @@
 #include "schema/model_generated.h"
 #include "src/kernel_registry.h"
 #include "src/runtime/kernel/arm/opclib/depth_to_space.h"
+#include "src/runtime/kernel/arm/opclib/int8/depth_to_space_int8.h"
 #include "include/errorcode.h"

 using mindspore::lite::RET_OK;
@@ -31,6 +32,16 @@ int DepthToSpaceInt8CPUKernel::Init() {
  }
  DepthToSpaceParameter *param = reinterpret_cast<DepthToSpaceParameter *>(opParameter);
  param->data_type_size_ = sizeof(int8_t);
+
+  auto *input_tensor = inputs_.at(kInputIndex);
+  auto in_quant_args = input_tensor->GetQuantParams();
+  in_quant_arg_.scale_ = in_quant_args.front().scale;
+  in_quant_arg_.zp_ = in_quant_args.front().zeroPoint;
+
+  auto *out_tensor = outputs_.at(kOutputIndex);
+  auto out_quant_args = out_tensor->GetQuantParams();
+  out_quant_arg_.scale_ = out_quant_args.front().scale;
+  out_quant_arg_.zp_ = out_quant_args.front().zeroPoint;
  return RET_OK;
 }

@@ -41,14 +52,11 @@ int DepthToSpaceInt8CPUKernel::Run() {
  int8_t *output_data = reinterpret_cast<int8_t *>(output->Data());
  auto in_shape = input->shape();
  DepthToSpaceParameter *param = reinterpret_cast<DepthToSpaceParameter *>(opParameter);
-  if (input->GetFormat() == schema::Format_NHWC) {
+  if (in_quant_arg_.scale_ == out_quant_arg_.scale_ && in_quant_arg_.zp_ == out_quant_arg_.zp_) {
    DepthToSpaceForNHWC(input_data, output_data, in_shape.data(), param);
-    return RET_OK;
  } else {
-    MS_LOG(ERROR) << "Depth_to_space only support NHWC now!";
-    return RET_ERROR;
+    DepthToSpaceForNHWC(input_data, output_data, in_shape.data(), param, &in_quant_arg_, &out_quant_arg_);
  }
-
  return RET_OK;
 }
 }  // namespace mindspore::kernel
--- a/mindspore/lite/src/runtime/kernel/arm/int8/depth_to_space_int8.h
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/depth_to_space_int8.h
@@ -31,6 +31,9 @@ class DepthToSpaceInt8CPUKernel : public DepthToSpaceBaseCPUKernel {
  int Init() override;
  int ReSize() override { return 0; }
  int Run() override;
+ private:
+  QuantArg in_quant_arg_;
+  QuantArg out_quant_arg_;
 };
 }  // namespace mindspore::kernel


--- a/mindspore/lite/src/runtime/kernel/arm/opclib/arg_min_max.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/arg_min_max.cc
@@ -15,10 +15,8 @@
 */
 #include "src/runtime/kernel/arm/opclib/arg_min_max.h"
 #include "src/runtime/kernel/arm/opclib/fp32/arg_min_max.h"
-#include "src/runtime/kernel/arm/opclib/int8/arg_min_max.h"

 #define FLOAT_DATA_TYPE 43
-#define INT8_DATA_TYPE  32

 void GetCalcParameter(const int *shape, int dims_number, int axis, int *pre_axis_count, int *axis_count,
                      int *after_axis_count) {
@@ -51,16 +49,6 @@ void ArgMinMaxTopk1(const void *input, void *output, const int *shape, ArgMinMax
      }
      break;
    }
-    case INT8_DATA_TYPE: {
-      if (param->get_max_) {
-        ArgMax(reinterpret_cast<const int8_t *>(input), reinterpret_cast<int8_t *>(output), param, pre_axis_count,
-               axis_count, after_axis_count);
-      } else {
-        ArgMin(reinterpret_cast<const int8_t *>(input), reinterpret_cast<int8_t *>(output), param, pre_axis_count,
-               axis_count, after_axis_count);
-      }
-      break;
-    }
    default:
      break;
  }
@@ -100,40 +88,6 @@ void ArgMinMaxTopknFp32(const float *input, float *output, const int *in_shape,
  }
 }

-void ArgMinMaxTopknInt8(const int8_t *input, int8_t *output, const int *in_shape, ArgMinMaxParameter *param) {
-  if (param->get_max_) {
-    switch (param->axis_) {
-      case 0:
-        ArgMaxDim0(input, output, in_shape, param);
-        break;
-      case 1:
-        ArgMaxDim1(input, output, in_shape, param);
-        break;
-      case 2:
-        ArgMaxDim2(input, output, in_shape, param);
-        break;
-      case 3:
-        ArgMaxDim3(input, output, in_shape, param);
-        break;
-    }
-  } else {
-    switch (param->axis_) {
-      case 0:
-        ArgMinDim0(input, output, in_shape, param);
-        break;
-      case 1:
-        ArgMinDim1(input, output, in_shape, param);
-        break;
-      case 2:
-        ArgMinDim2(input, output, in_shape, param);
-        break;
-      case 3:
-        ArgMinDim3(input, output, in_shape, param);
-        break;
-    }
-  }
-}
-
 void ArgMinMax(const void *input, void *output, const int *in_shape, ArgMinMaxParameter *param) {
  if (param->topk_ == 1) {
    ArgMinMaxTopk1(input, output, in_shape, param);
@@ -145,10 +99,6 @@ void ArgMinMax(const void *input, void *output, const int *in_shape, ArgMinMaxPa
      ArgMinMaxTopknFp32(reinterpret_cast<const float *>(input), reinterpret_cast<float *>(output), in_shape, param);
      return;
    }
-    case INT8_DATA_TYPE: {
-      ArgMinMaxTopknInt8(reinterpret_cast<const int8_t *>(input), reinterpret_cast<int8_t *>(output), in_shape, param);
-      return;
-    }
    default:
      break;
  }

--- a/mindspore/lite/src/runtime/kernel/arm/opclib/arg_min_max.h
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/arg_min_max.h
@@ -18,6 +18,7 @@

 #include "src/runtime/kernel/arm/opclib/op_base.h"
 #include "src/runtime/kernel/arm/opclib/arg_min_max_parameter.h"
+#include "src/runtime/kernel/arm/opclib/quantization/quantize.h"

 void ArgMinMax(const void *input, void *output, const int *in_shape, ArgMinMaxParameter *param);
 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_ARG_MIN_MAX_H_
--- a/mindspore/lite/src/runtime/kernel/arm/opclib/depth_to_space.h
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/depth_to_space.h
@@ -15,19 +15,7 @@
 */
 #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_DEPTH_TO_SPACE_H_
 #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_DEPTH_TO_SPACE_H_
-#include "src/runtime/kernel/arm/opclib/op_base.h"
-
-struct DepthToSpaceParameter {
-    OpParameter op_parameter_;
-    int32_t block_size_;
-    int32_t in_stride_dim0_;
-    int32_t in_stride_dim1_;
-    int32_t in_stride_dim2_;
-    int32_t out_stride_dim0_;
-    int32_t out_stride_dim1_;
-    int32_t out_stride_dim2_;
-    uint8_t data_type_size_;
-};
+#include "src/runtime/kernel/arm/opclib/depth_to_space_parameter.h"

 void DepthToSpaceForNHWC(const void *input, void *output, int *in_shape, DepthToSpaceParameter *param);
 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_DEPTH_TO_SPACE_H_
--- a/mindspore/lite/src/runtime/kernel/arm/opclib/depth_to_space_parameter.h
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/depth_to_space_parameter.h
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_DEPTH_TO_SPACE_PARAMETER_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_DEPTH_TO_SPACE_PARAMETER_H_
+#include "src/runtime/kernel/arm/opclib/op_base.h"
+
+// Parameter block for the depth-to-space routines (passed to DepthToSpaceForNHWC).
+struct DepthToSpaceParameter {
+    // Generic operator header. NOTE(review): presumably must stay the first member so the
+    // struct can be reinterpreted from an OpParameter pointer - confirm.
+    OpParameter op_parameter_;
+    // Block size of the rearrangement. NOTE(review): exact semantics are not visible here.
+    int32_t block_size_;
+    // Input strides for dimensions 0..2. NOTE(review): presumably in elements - confirm.
+    int32_t in_stride_dim0_;
+    int32_t in_stride_dim1_;
+    int32_t in_stride_dim2_;
+    // Output strides for dimensions 0..2. NOTE(review): presumably in elements - confirm.
+    int32_t out_stride_dim0_;
+    int32_t out_stride_dim1_;
+    int32_t out_stride_dim2_;
+    // Size in bytes of one element; the int8 kernel sets this to sizeof(int8_t).
+    uint8_t data_type_size_;
+};
+
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_DEPTH_TO_SPACE_PARAMETER_H_
--- a/mindspore/lite/src/runtime/kernel/arm/opclib/int8/arg_min_max.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/int8/arg_min_max.cc
--- a/mindspore/lite/src/runtime/kernel/arm/opclib/int8/arg_min_max_int8.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/int8/arg_min_max_int8.cc
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "src/runtime/kernel/arm/opclib/int8/arg_min_max_int8.h"
+#include <float.h>
+
+// Splits a shape around `axis`: the product of the dimensions before it, the extent of the axis
+// itself, and the product of the dimensions after it (up to dims_number).
+void CalcParameter(const int *shape, int dims_number, int axis, int *pre_axis_count, int *axis_count,
+                      int *after_axis_count) {
+  int outer = 1;
+  for (int d = 0; d < axis; ++d) {
+    outer *= shape[d];
+  }
+
+  int inner = 1;
+  for (int d = axis + 1; d < dims_number; ++d) {
+    inner *= shape[d];
+  }
+
+  *pre_axis_count = outer;
+  *axis_count = shape[axis];
+  *after_axis_count = inner;
+}
+
+// Top-1 arg-min/arg-max over one axis of a quantized tensor.
+// The input is viewed as [pre_axis_count, axis_count, after_axis_count]. For every (pre, after)
+// pair the axis is scanned on dequantized values; the first extreme wins on ties. The result is
+// either the extreme value (param->out_value_) or its index, requantized with out_quant_arg.
+void ArgMinMaxQuant(const int8_t *input, int8_t *output, ArgMinMaxParameter *param, int pre_axis_count, int axis_count,
+            int after_axis_count, QuantArg *in_quant_arg, QuantArg *out_quant_arg) {
+  const float kInt8Min = -128.0f;
+  const float kInt8Max = 127.0f;
+  bool out_value = param->out_value_;
+  float output_inverse_scale = 1.f / out_quant_arg->scale_;
+  float bias = -in_quant_arg->zp_ * in_quant_arg->scale_;
+  int32_t output_zp = out_quant_arg->zp_;
+  for (int i = 0; i < pre_axis_count; ++i) {
+    size_t output_offset = i * after_axis_count;
+    size_t input_offset = output_offset * axis_count;
+    for (int j = 0; j < after_axis_count; ++j) {
+      // Seed with the worst possible value for the requested direction.
+      float value = -FLT_MAX;
+      if (!param->get_max_) {
+        value = FLT_MAX;
+      }
+      float index = 0.0f;
+      for (int k = 0; k < axis_count; ++k) {
+        float value_tmp = input[input_offset + k * after_axis_count + j] * in_quant_arg->scale_ + bias;
+        if (param->get_max_) {
+          if (value_tmp > value) {
+            value = value_tmp;
+            index = k;
+          }
+        } else {
+          if (value_tmp < value) {
+            value = value_tmp;
+            index = k;
+          }
+        }
+      }
+      float real_out = out_value ? value : index;
+      float quantized = real_out * output_inverse_scale + output_zp;
+      // Saturate before narrowing: converting an out-of-range (or NaN) float to int8_t is
+      // undefined. In-range values still truncate toward zero exactly as before.
+      if (!(quantized >= kInt8Min)) {
+        quantized = kInt8Min;
+      } else if (quantized > kInt8Max) {
+        quantized = kInt8Max;
+      }
+      output[output_offset + j] = static_cast<int8_t>(quantized);
+    }
+  }
+}
+
+// Shape-based entry point: derives the outer / axis / inner extents from in_shape and
+// param->axis_, then runs the core overload.
+void ArgMinMaxQuant(const int8_t *input, int8_t *output, const int *in_shape, ArgMinMaxParameter *param,
+                    QuantArg *in_quant_arg, QuantArg *out_quant_arg) {
+  int outer_count = 1;
+  int reduce_count = 1;
+  int inner_count = 1;
+  CalcParameter(in_shape, param->dims_size_, param->axis_, &outer_count, &reduce_count, &inner_count);
+  ArgMinMaxQuant(input, output, param, outer_count, reduce_count, inner_count, in_quant_arg, out_quant_arg);
+}
+
+// qsort comparator: ascending by dequantized value, ties broken by ascending index.
+// Uses explicit comparisons instead of returning the float difference, because converting that
+// difference to int truncates any gap smaller than 1.0 to "equal". The index tie-break keeps
+// the order deterministic, since qsort is not stable.
+int ArgCompareAscInt8(const void *a, const void *b) {
+  const ArgElement *lhs = reinterpret_cast<const ArgElement *>(a);
+  const ArgElement *rhs = reinterpret_cast<const ArgElement *>(b);
+  if (lhs->data_.f_data_ < rhs->data_.f_data_) {
+    return -1;
+  }
+  if (lhs->data_.f_data_ > rhs->data_.f_data_) {
+    return 1;
+  }
+  return (lhs->index_ > rhs->index_) - (lhs->index_ < rhs->index_);
+}
+
+// qsort comparator: descending by dequantized value, ties broken by ascending index.
+// Uses explicit comparisons instead of returning the float difference, because converting that
+// difference to int truncates any gap smaller than 1.0 to "equal". The index tie-break keeps
+// the order deterministic, since qsort is not stable.
+int ArgCompareDescInt8(const void *a, const void *b) {
+  const ArgElement *lhs = reinterpret_cast<const ArgElement *>(a);
+  const ArgElement *rhs = reinterpret_cast<const ArgElement *>(b);
+  if (lhs->data_.f_data_ > rhs->data_.f_data_) {
+    return -1;
+  }
+  if (lhs->data_.f_data_ < rhs->data_.f_data_) {
+    return 1;
+  }
+  return (lhs->index_ > rhs->index_) - (lhs->index_ < rhs->index_);
+}
+
+// Requantizes a real value: real_out * output_inverse_scale + output_zp, saturated to the int8
+// range. Saturation avoids the undefined out-of-range float to int8_t conversion; NaN maps to
+// -128. In-range values still truncate toward zero exactly as before.
+int8_t GetInt8Output(float real_out, float output_inverse_scale, int32_t output_zp) {
+  const float kInt8Min = -128.0f;
+  const float kInt8Max = 127.0f;
+  float quantized = real_out * output_inverse_scale + output_zp;
+  if (!(quantized >= kInt8Min)) {
+    return static_cast<int8_t>(kInt8Min);
+  }
+  if (quantized > kInt8Max) {
+    return static_cast<int8_t>(kInt8Max);
+  }
+  return static_cast<int8_t>(quantized);
+}
+
+// Top-k arg-min/arg-max along dimension 0. For each position within in_strides_[0] the
+// in_shape[0] candidates are dequantized into param->arg_elements_, sorted, and the first
+// param->topk_ entries are written out as requantized values or indices.
+void ArgMinMaxDim0(const int8_t *input, int8_t *output, const int *in_shape, ArgMinMaxParameter *param,
+                   QuantArg *in_quant_arg, QuantArg *out_quant_arg) {
+  // Descending order for arg-max, ascending for arg-min.
+  int (*compare)(const void *, const void *) = param->get_max_ ? ArgCompareDescInt8 : ArgCompareAscInt8;
+  const bool emit_value = param->out_value_;
+  const float inv_out_scale = 1.f / out_quant_arg->scale_;
+  const float dequant_bias = -in_quant_arg->zp_ * in_quant_arg->scale_;
+  const int32_t out_zp = out_quant_arg->zp_;
+  for (int32_t inner = 0; inner < param->in_strides_[0]; ++inner) {
+    for (int n = 0; n < in_shape[0]; ++n) {
+      size_t src = param->in_strides_[0] * n + inner;
+      param->arg_elements_[n].index_ = n;
+      param->arg_elements_[n].data_.f_data_ = input[src] * in_quant_arg->scale_ + dequant_bias;
+    }
+    qsort(param->arg_elements_, in_shape[0], sizeof(ArgElement), compare);
+
+    for (int rank = 0; rank < param->topk_; ++rank) {
+      size_t dst = rank * param->out_strides_[0] + inner;
+      float picked = emit_value ? param->arg_elements_[rank].data_.f_data_ : param->arg_elements_[rank].index_;
+      output[dst] = GetInt8Output(picked, inv_out_scale, out_zp);
+    }
+  }
+}
+
+// Top-k arg-min/arg-max along dimension 1. For each dim-0 index and each position within
+// in_strides_[1], the in_shape[1] candidates are dequantized into param->arg_elements_, sorted,
+// and the first param->topk_ entries are written out as requantized values or indices.
+void ArgMinMaxDim1(const int8_t *input, int8_t *output, const int *in_shape, ArgMinMaxParameter *param,
+                   QuantArg *in_quant_arg, QuantArg *out_quant_arg) {
+  // Descending order for arg-max, ascending for arg-min.
+  int (*compare)(const void *, const void *) = param->get_max_ ? ArgCompareDescInt8 : ArgCompareAscInt8;
+  const bool emit_value = param->out_value_;
+  const float inv_out_scale = 1.f / out_quant_arg->scale_;
+  const float dequant_bias = -in_quant_arg->zp_ * in_quant_arg->scale_;
+  const int32_t out_zp = out_quant_arg->zp_;
+  const int axis_len = in_shape[1];
+  for (int n = 0; n < in_shape[0]; ++n) {
+    size_t src_base0 = n * param->in_strides_[0];
+    size_t dst_base0 = n * param->out_strides_[0];
+    for (int inner = 0; inner < param->in_strides_[1]; ++inner) {
+      for (int c = 0; c < axis_len; ++c) {
+        size_t src = param->in_strides_[1] * c + src_base0 + inner;
+        param->arg_elements_[c].index_ = c;
+        param->arg_elements_[c].data_.f_data_ = input[src] * in_quant_arg->scale_ + dequant_bias;
+      }
+      qsort(param->arg_elements_, axis_len, sizeof(ArgElement), compare);
+
+      for (int rank = 0; rank < param->topk_; ++rank) {
+        size_t dst = dst_base0 + inner + rank * param->out_strides_[1];
+        float picked = emit_value ? param->arg_elements_[rank].data_.f_data_ : param->arg_elements_[rank].index_;
+        output[dst] = GetInt8Output(picked, inv_out_scale, out_zp);
+      }
+    }
+  }
+}
+
+// Top-k arg-min/arg-max along dimension 2. For each (dim-0, dim-1) pair and each position within
+// in_strides_[2], the in_shape[2] candidates are dequantized into param->arg_elements_, sorted,
+// and the first param->topk_ entries are written out as requantized values or indices.
+void ArgMinMaxDim2(const int8_t *input, int8_t *output, const int *in_shape, ArgMinMaxParameter *param,
+                   QuantArg *in_quant_arg, QuantArg *out_quant_arg) {
+  // Descending order for arg-max, ascending for arg-min.
+  int (*compare)(const void *, const void *) = param->get_max_ ? ArgCompareDescInt8 : ArgCompareAscInt8;
+  const bool emit_value = param->out_value_;
+  const float inv_out_scale = 1.f / out_quant_arg->scale_;
+  const float dequant_bias = -in_quant_arg->zp_ * in_quant_arg->scale_;
+  const int32_t out_zp = out_quant_arg->zp_;
+  const int dim1_len = in_shape[1];
+  const int axis_len = in_shape[2];
+  for (int n = 0; n < in_shape[0]; ++n) {
+    size_t src_base0 = n * param->in_strides_[0];
+    size_t dst_base0 = n * param->out_strides_[0];
+    for (int h = 0; h < dim1_len; ++h) {
+      size_t src_base1 = h * param->in_strides_[1] + src_base0;
+      size_t dst_base1 = h * param->out_strides_[1] + dst_base0;
+      for (int inner = 0; inner < param->in_strides_[2]; ++inner) {
+        for (int w = 0; w < axis_len; ++w) {
+          size_t src = param->in_strides_[2] * w + inner + src_base1;
+          param->arg_elements_[w].index_ = w;
+          param->arg_elements_[w].data_.f_data_ = input[src] * in_quant_arg->scale_ + dequant_bias;
+        }
+        qsort(param->arg_elements_, axis_len, sizeof(ArgElement), compare);
+
+        for (int rank = 0; rank < param->topk_; ++rank) {
+          size_t dst = dst_base1 + inner + rank * param->out_strides_[2];
+          float picked = emit_value ? param->arg_elements_[rank].data_.f_data_ : param->arg_elements_[rank].index_;
+          output[dst] = GetInt8Output(picked, inv_out_scale, out_zp);
+        }
+      }
+    }
+  }
+}
+
+// Top-k arg-min/arg-max along dimension 3 (the innermost one here). For each (dim-0, dim-1,
+// dim-2) triple the in_shape[3] candidates are dequantized into param->arg_elements_, sorted,
+// and the first param->topk_ entries are written out as requantized values or indices.
+void ArgMinMaxDim3(const int8_t *input, int8_t *output, const int *in_shape, ArgMinMaxParameter *param,
+                   QuantArg *in_quant_arg, QuantArg *out_quant_arg) {
+  // Descending order for arg-max, ascending for arg-min.
+  int (*compare)(const void *, const void *) = param->get_max_ ? ArgCompareDescInt8 : ArgCompareAscInt8;
+  const bool emit_value = param->out_value_;
+  const float inv_out_scale = 1.f / out_quant_arg->scale_;
+  const float dequant_bias = -in_quant_arg->zp_ * in_quant_arg->scale_;
+  const int32_t out_zp = out_quant_arg->zp_;
+  const int dim1_len = in_shape[1];
+  const int dim2_len = in_shape[2];
+  const int axis_len = in_shape[3];
+  for (int n = 0; n < in_shape[0]; ++n) {
+    size_t src_base0 = n * param->in_strides_[0];
+    size_t dst_base0 = n * param->out_strides_[0];
+    for (int h = 0; h < dim1_len; ++h) {
+      size_t src_base1 = h * param->in_strides_[1] + src_base0;
+      size_t dst_base1 = h * param->out_strides_[1] + dst_base0;
+      for (int w = 0; w < dim2_len; ++w) {
+        size_t src_base2 = w * param->in_strides_[2] + src_base1;
+        size_t dst_base2 = w * param->out_strides_[2] + dst_base1;
+        for (int c = 0; c < axis_len; ++c) {
+          size_t src = c + src_base2;
+          param->arg_elements_[c].index_ = c;
+          param->arg_elements_[c].data_.f_data_ = input[src] * in_quant_arg->scale_ + dequant_bias;
+        }
+        qsort(param->arg_elements_, axis_len, sizeof(ArgElement), compare);
+
+        for (int rank = 0; rank < param->topk_; ++rank) {
+          size_t dst = dst_base2 + rank;
+          float picked = emit_value ? param->arg_elements_[rank].data_.f_data_ : param->arg_elements_[rank].index_;
+          output[dst] = GetInt8Output(picked, inv_out_scale, out_zp);
+        }
+      }
+    }
+  }
+}
--- a/mindspore/lite/src/runtime/kernel/arm/opclib/int8/arg_min_max.h
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/int8/arg_min_max.h
@@ -13,21 +13,20 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
-#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_INT8_ARG_MIN_MAX_H_
-#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_INT8_ARG_MIN_MAX_H_
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_INT8_ARG_MIN_MAX_INT8_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_INT8_ARG_MIN_MAX_INT8_H_

 #include "src/runtime/kernel/arm/opclib/arg_min_max_parameter.h"
+#include "src/runtime/kernel/arm/opclib/quantization/quantize.h"

-void ArgMax(const int8_t *input, int8_t *output, ArgMinMaxParameter *param, int pre_axis_count, int axis_count,
-            int after_axis_count);
-void ArgMin(const int8_t *input, int8_t *output, ArgMinMaxParameter *param, int pre_axis_count, int axis_count,
-            int after_axis_count);
-void ArgMaxDim0(const int8_t *input, int8_t *output, const int *in_shape, ArgMinMaxParameter *param);
-void ArgMinDim0(const int8_t *input, int8_t *output, const int *in_shape, ArgMinMaxParameter *param);
-void ArgMaxDim1(const int8_t *input, int8_t *output, const int *in_shape, ArgMinMaxParameter *param);
-void ArgMinDim1(const int8_t *input, int8_t *output, const int *in_shape, ArgMinMaxParameter *param);
-void ArgMaxDim2(const int8_t *input, int8_t *output, const int *in_shape, ArgMinMaxParameter *param);
-void ArgMinDim2(const int8_t *input, int8_t *output, const int *in_shape, ArgMinMaxParameter *param);
-void ArgMaxDim3(const int8_t *input, int8_t *output, const int *in_shape, ArgMinMaxParameter *param);
-void ArgMinDim3(const int8_t *input, int8_t *output, const int *in_shape, ArgMinMaxParameter *param);
-#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_INT8_ARG_MIN_MAX_H_
+void ArgMinMaxQuant(const int8_t *input, int8_t *output, const int *in_shape, ArgMinMaxParameter *param,
+                    QuantArg *in_quant, QuantArg *out_quant);  // NOTE(review): presumably the entry point - confirm
+void ArgMinMaxDim0(const int8_t *input, int8_t *output, const int *in_shape, ArgMinMaxParameter *param,
+                   QuantArg *in_quant, QuantArg *out_quant);  // presumably the axis-0 variant - TODO confirm
+void ArgMinMaxDim1(const int8_t *input, int8_t *output, const int *in_shape, ArgMinMaxParameter *param,
+                   QuantArg *in_quant, QuantArg *out_quant);  // presumably the axis-1 variant - TODO confirm
+void ArgMinMaxDim2(const int8_t *input, int8_t *output, const int *in_shape, ArgMinMaxParameter *param,
+                   QuantArg *in_quant, QuantArg *out_quant);  // presumably the axis-2 variant - TODO confirm
+void ArgMinMaxDim3(const int8_t *input, int8_t *output, const int *in_shape, ArgMinMaxParameter *param,
+                   QuantArg *in_quant, QuantArg *out_quant);  // top-k over in_shape[3], the innermost axis
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_INT8_ARG_MIN_MAX_INT8_H_
--- a/mindspore/lite/src/runtime/kernel/arm/opclib/int8/arithmetic_int8.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/int8/arithmetic_int8.cc
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/runtime/kernel/arm/opclib/int8/arithmetic_int8.h"
+#ifdef ENABLE_NEON
+#include <arm_neon.h>
+#endif
+#include "src/runtime/kernel/arm/opclib/errorcode.h"
+
+// Element-wise "not equal" on raw int8 values: output[i] is 1 when input0[i] != input1[i], else 0.
+// Quantization parameters are not consulted. Always returns OPCLIB_OK.
+int ElementNotEqual(int8_t *input0, int8_t *input1, int8_t *output, int element_size) {
+  int i = 0;
+  while (i < element_size) {
+    const bool hit = input0[i] != input1[i];
+    output[i] = hit ? 1 : 0;
+    ++i;
+  }
+  return OPCLIB_OK;
+}
+
+// Element-wise "equal" on raw int8 values: output[i] is 1 when input0[i] == input1[i], else 0.
+// Quantization parameters are not consulted. Always returns OPCLIB_OK.
+int ElementEqual(int8_t *input0, int8_t *input1, int8_t *output, int element_size) {
+  int i = 0;
+  while (i < element_size) {
+    const bool hit = input0[i] == input1[i];
+    output[i] = hit ? 1 : 0;
+    ++i;
+  }
+  return OPCLIB_OK;
+}
+
+// Element-wise "less than" on raw int8 values: output[i] is 1 when input0[i] < input1[i], else 0.
+// Quantization parameters are not consulted. Always returns OPCLIB_OK.
+int ElementLess(int8_t *input0, int8_t *input1, int8_t *output, int element_size) {
+  int i = 0;
+  while (i < element_size) {
+    const bool hit = input0[i] < input1[i];
+    output[i] = hit ? 1 : 0;
+    ++i;
+  }
+  return OPCLIB_OK;
+}
+
+// Element-wise "less or equal" on raw int8 values: output[i] is 1 when input0[i] <= input1[i], else 0.
+// Quantization parameters are not consulted. Always returns OPCLIB_OK.
+int ElementLessEqual(int8_t *input0, int8_t *input1, int8_t *output, int element_size) {
+  int i = 0;
+  while (i < element_size) {
+    const bool hit = input0[i] <= input1[i];
+    output[i] = hit ? 1 : 0;
+    ++i;
+  }
+  return OPCLIB_OK;
+}
+
+// Element-wise "greater than" on raw int8 values: output[i] is 1 when input0[i] > input1[i], else 0.
+// Quantization parameters are not consulted. Always returns OPCLIB_OK.
+int ElementGreater(int8_t *input0, int8_t *input1, int8_t *output, int element_size) {
+  int i = 0;
+  while (i < element_size) {
+    const bool hit = input0[i] > input1[i];
+    output[i] = hit ? 1 : 0;
+    ++i;
+  }
+  return OPCLIB_OK;
+}
+
+// Element-wise "greater or equal" on raw int8 values: output[i] is 1 when input0[i] >= input1[i], else 0.
+// Quantization parameters are not consulted. Always returns OPCLIB_OK.
+int ElementGreaterEqual(int8_t *input0, int8_t *input1, int8_t *output, int element_size) {
+  int i = 0;
+  while (i < element_size) {
+    const bool hit = input0[i] >= input1[i];
+    output[i] = hit ? 1 : 0;
+    ++i;
+  }
+  return OPCLIB_OK;
+}
--- a/mindspore/lite/src/runtime/kernel/arm/opclib/int8/arithmetic_int8.h
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/int8/arithmetic_int8.h
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_INT8_ARITHMETIC_INT8_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_INT8_ARITHMETIC_INT8_H_
+
+#include "src/runtime/kernel/arm/opclib/op_base.h"
+
+int ElementNotEqual(int8_t *input0, int8_t *input1, int8_t *output, int element_size);  // out[i] = in0[i] != in1[i]
+
+int ElementEqual(int8_t *input0, int8_t *input1, int8_t *output, int element_size);  // out[i] = in0[i] == in1[i]
+
+int ElementLess(int8_t *input0, int8_t *input1, int8_t *output, int element_size);  // out[i] = in0[i] < in1[i]
+
+int ElementLessEqual(int8_t *input0, int8_t *input1, int8_t *output, int element_size);  // out[i] = in0[i] <= in1[i]
+
+int ElementGreater(int8_t *input0, int8_t *input1, int8_t *output, int element_size);  // out[i] = in0[i] > in1[i]
+
+int ElementGreaterEqual(int8_t *input0, int8_t *input1, int8_t *output, int element_size);  // out[i] = in0 >= in1
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_INT8_ARITHMETIC_INT8_H_
--- a/mindspore/lite/src/runtime/kernel/arm/opclib/int8/batch_to_space_int8.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/int8/batch_to_space_int8.cc
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/runtime/kernel/arm/opclib/int8/batch_to_space_int8.h"
+#include "src/runtime/kernel/arm/opclib/arithmetic_common.h"
+
+// Batch-to-space without cropping for an NHWC int8 tensor.
+// in_shape is read as {N, H, W, C} (N itself is not read; out_n drives the batch loop) and block as
+// {block_h, block_w}. Every element is rescaled from the input to the output quantization parameters,
+// rounded, offset by the output zero point, clamped to [-128, 127] and appended to output in visiting order.
+void BatchToSpaceNoCropForNHWC(const int8_t *input, int8_t *output, const int *in_shape, int out_n, const int *block,
+                               QuantArg *in_quant_arg, QuantArg *out_quant_arg) {
+  const int blk_h = block[0];
+  const int blk_w = block[1];
+  const int height = in_shape[1];
+  const int width = in_shape[2];
+  const int channels = in_shape[3];
+  const size_t row_stride = width * channels;
+  const size_t batch_stride = row_stride * height;
+  const size_t blk_row_stride = blk_w * out_n;
+  const float inv_out_scale = 1.f / out_quant_arg->scale_;
+  const float scale = in_quant_arg->scale_ * inv_out_scale;
+  const float bias = -in_quant_arg->zp_ * scale;
+  const int32_t out_zp = out_quant_arg->zp_;
+  int8_t *dst = output;
+
+  for (int n = 0; n < out_n; ++n) {
+    for (int h = 0; h < height; ++h) {
+      const size_t row_off = h * row_stride;
+      for (int bh = 0; bh < blk_h; ++bh) {
+        for (int w = 0; w < width; ++w) {
+          const size_t col_off = w * channels;
+          for (int bw = 0; bw < blk_w; ++bw) {
+            // Source batch index that holds block cell (bh, bw) of output batch n.
+            const size_t src_off = batch_stride * (bh * blk_row_stride + bw * out_n + n) + col_off + row_off;
+            const int8_t *src = input + src_off;
+            for (int c = 0; c < channels; ++c) {
+              int32_t quantized = round(src[c] * scale + bias) + out_zp;
+              if (quantized > 127) {
+                quantized = 127;
+              }
+              if (quantized < -128) {
+                quantized = -128;
+              }
+              *dst++ = quantized;
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+// Batch-to-space with cropping for an NHWC int8 tensor.
+//   in_shape : read as {N, H, W, C}; N is not needed here because the batch loop is driven by out_n
+//   block    : {block_h, block_w}
+//   crops    : {h_begin, h_end, w_begin, w_end} crop amounts, expressed in output rows/columns
+// Each surviving element is rescaled from the input to the output quantization parameters, rounded,
+// offset by the output zero point, clamped to [-128, 127] and appended to output in visiting order.
+void BatchToSpaceForNHWC(const int8_t *input, int8_t *output, const int *in_shape, int out_n, const int *block,
+                         const int *crops, QuantArg *in_quant_arg, QuantArg *out_quant_arg) {
+  const int block_h = block[0];
+  const int block_w = block[1];
+  const int in_h = in_shape[1];
+  const int in_w = in_shape[2];
+  const int in_c = in_shape[3];
+  // Input rows/columns that can contribute at least one uncropped output row/column.
+  const int h_start = crops[0] / block_h;
+  const int h_end = MSMIN((in_h * block_h - crops[1]) / block_h + 1, in_h);
+  const int w_start = crops[2] / block_w;
+  const int w_end = MSMIN((in_w * block_w - crops[3]) / block_w + 1, in_w);
+  // Inclusive range of output rows/columns that survive the crop.
+  const int h_valid_begin = crops[0];
+  const int h_valid_end = in_h * block_h - crops[1] - 1;
+  const int w_valid_begin = crops[2];
+  const int w_valid_end = in_w * block_w - crops[3] - 1;
+
+  size_t stride_h = block_w * out_n;
+  size_t output_offset = 0;
+  size_t in_stride_h = in_w * in_c;
+  size_t in_stride_n = in_stride_h * in_h;
+
+  float output_inverse_scale = 1.f / out_quant_arg->scale_;
+  float scale = in_quant_arg->scale_ * output_inverse_scale;
+  float bias = -in_quant_arg->zp_ * scale;
+  int32_t output_zp = out_quant_arg->zp_;
+
+  for (int n = 0; n < out_n; ++n) {
+    for (int h = h_start; h < h_end; ++h) {
+      size_t h_offset = h * in_stride_h;
+      for (int bh = 0; bh < block_h; ++bh) {
+        // Signed on purpose: the crop bounds are ints, and comparing them against an unsigned index
+        // would turn a negative bound into a huge value.
+        const int h_index = h * block_h + bh;
+        if (h_index < h_valid_begin || h_index > h_valid_end) {
+          continue;
+        }
+        for (int w = w_start; w < w_end; ++w) {
+          size_t w_offset = w * in_c;
+          for (int bw = 0; bw < block_w; ++bw) {
+            const int w_index = w * block_w + bw;
+            if (w_index < w_valid_begin || w_index > w_valid_end) {
+              continue;
+            }
+            // Source batch index that holds block cell (bh, bw) of output batch n.
+            size_t in_offset = in_stride_n * (bh * stride_h + bw * out_n + n) + w_offset + h_offset;
+            for (int c = 0; c < in_c; ++c) {
+              int32_t output_tmp = round(input[in_offset + c] * scale + bias) + output_zp;
+              output_tmp = output_tmp > 127 ? 127 : output_tmp;
+              output_tmp = output_tmp < -128 ? -128 : output_tmp;
+              output[output_offset++] = output_tmp;
+            }
+          }
+        }
+      }
+    }
+  }
+}
--- a/mindspore/lite/src/runtime/kernel/arm/opclib/int8/batch_to_space_int8.h
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/int8/batch_to_space_int8.h
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_INT8_BATCH_TO_SPACE_INT8_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_INT8_BATCH_TO_SPACE_INT8_H_
+#include "src/runtime/kernel/arm/opclib/op_base.h"
+#include "src/runtime/kernel/arm/opclib/quantization/quantize.h"
+
+void BatchToSpaceNoCropForNHWC(const int8_t *input, int8_t *output, const int *in_shape, int out_n, const int *block,
+                               QuantArg *in_quant_arg, QuantArg *out_quant_arg);  // requantizing, no cropping
+void BatchToSpaceForNHWC(const int8_t *input, int8_t *output, const int *in_shape, int out_n, const int *block,
+                         const int *crops, QuantArg *in_quant_arg, QuantArg *out_quant_arg);  // crops: 4 ints
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_INT8_BATCH_TO_SPACE_INT8_H_
--- a/mindspore/lite/src/runtime/kernel/arm/opclib/int8/depth_to_space_int8.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/int8/depth_to_space_int8.cc
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "src/runtime/kernel/arm/opclib/int8/depth_to_space_int8.h"
+#include <string.h>
+
+// Depth-to-space for an NHWC int8 tensor with requantization.
+// For every input pixel (i, j, k) the channel data is split into block_size runs of
+// block_size * out_stride_dim2_ elements; run l is written to output row j * block_size + l, starting at
+// column k * block_size. Each element is rescaled from the input to the output quantization parameters,
+// rounded, offset by the output zero point and clamped to [-128, 127].
+// Strides and block size are taken from param; in_shape supplies the N, H and W loop bounds.
+void DepthToSpaceForNHWC(const int8_t *input, int8_t *output, int *in_shape, DepthToSpaceParameter *param,
+                         QuantArg *in_quant_arg, QuantArg *out_quant_arg) {
+  int32_t block_size = param->block_size_;
+  int32_t in_shape_dim2 = in_shape[2];
+  int32_t in_shape_dim1 = in_shape[1];
+  // Number of contiguous elements moved per (pixel, block-row) pair.
+  size_t copy_size = block_size * param->out_stride_dim2_;
+  float output_inverse_scale = 1.f / out_quant_arg->scale_;
+  float scale = in_quant_arg->scale_ * output_inverse_scale;
+  float bias = -in_quant_arg->zp_ * scale;
+  int32_t output_zp = out_quant_arg->zp_;
+  for (int i = 0; i < in_shape[0]; ++i) {
+    size_t in_offset_n = i * param->in_stride_dim0_;
+    size_t out_offset_n = i * param->out_stride_dim0_;
+    for (int j = 0; j < in_shape_dim1; ++j) {
+      size_t in_offset_h = in_offset_n + j * param->in_stride_dim1_;
+      size_t out_offset_h = out_offset_n + j * block_size * param->out_stride_dim1_;
+      for (int k = 0; k < in_shape_dim2; ++k) {
+        size_t in_offset_w = in_offset_h + k * param->in_stride_dim2_;
+        size_t out_offset_w = out_offset_h + k * block_size * param->out_stride_dim2_;
+        for (int l = 0; l < block_size; ++l) {
+          size_t out_offset = out_offset_w + l * param->out_stride_dim1_;
+          size_t in_offset = in_offset_w + l * block_size * param->out_stride_dim2_;
+          // size_t counter: copy_size is unsigned, so an int counter would be a signed/unsigned comparison.
+          for (size_t m = 0; m < copy_size; ++m) {
+            int32_t output_tmp = round(input[in_offset + m] * scale + bias) + output_zp;
+            output_tmp = output_tmp > 127 ? 127 : output_tmp;
+            output_tmp = output_tmp < -128 ? -128 : output_tmp;
+            output[out_offset + m] = output_tmp;
+          }
+        }
+      }
+    }
+  }
+}
--- a/mindspore/lite/src/runtime/kernel/arm/opclib/int8/depth_to_space_int8.h
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/int8/depth_to_space_int8.h
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_INT8_DEPTH_TO_SPACE_INT8_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_INT8_DEPTH_TO_SPACE_INT8_H_
+
+#include "src/runtime/kernel/arm/opclib/depth_to_space_parameter.h"
+#include "src/runtime/kernel/arm/opclib/quantization/quantize.h"
+
+void DepthToSpaceForNHWC(const int8_t *input, int8_t *output, int *in_shape, DepthToSpaceParameter *param,
+                         QuantArg *in_quant_arg, QuantArg *out_quant_arg);  // int8 depth-to-space, requantizing
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_INT8_DEPTH_TO_SPACE_INT8_H_
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/argminmax_fp32_test.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/argminmax_fp32_test.cc
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "mindspore/core/utils/log_adapter.h"
+#include "common/common_test.h"
+#include "mindspore/lite/src/runtime/kernel/arm/opclib/fp32/arg_min_max.h"
+#include "mindspore/lite/src/runtime/kernel/arm/opclib/arg_min_max.h"
+#include "mindspore/lite/src/runtime/kernel/arm/opclib/arithmetic_common.h"
+
+namespace mindspore {
+
+class TestArgMinMaxTestFp32 : public mindspore::Common {  // fixture shared by the TEST_F cases in this file
+ public:
+  TestArgMinMaxTestFp32() = default;
+};
+
+// ArgMax along axis 0 of a 3x5 matrix with topk 1, reporting indices; the result is also echoed to stdout.
+TEST_F(TestArgMinMaxTestFp32, ArgMaxTest1) {
+  std::vector<int> dims = {3, 5};
+  std::vector<float> data = {10, 20, 30, 40, 90,   // row 0
+                             20, 11, 15, 1,  50,   // row 1
+                             30, 45, 25, 50, 30};  // row 2
+  std::vector<float> expected = {2, 2, 0, 2, 0};
+  ArgMinMaxParameter arg_param;
+  arg_param.get_max_ = true;
+  arg_param.out_value_ = false;
+  arg_param.topk_ = 1;
+  arg_param.axis_ = 0;
+  arg_param.dims_size_ = 2;
+  arg_param.data_type_ = 43;
+  float result[5];
+  ArgMinMax(data.data(), result, dims.data(), &arg_param);
+  for (float value : result) {
+    std::cout << value << " ";
+  }
+  std::cout << "\n";
+  CompareOutputData(result, expected.data(), expected.size(), 0.000001);
+}
+
+// ArgMax along axis 0 of a 3x5 matrix with topk 1, reporting the maximum values themselves.
+TEST_F(TestArgMinMaxTestFp32, ArgMaxTest2) {
+  std::vector<int> dims = {3, 5};
+  std::vector<float> data = {10, 20, 30, 40, 90,   // row 0
+                             20, 11, 15, 1,  50,   // row 1
+                             30, 45, 25, 50, 30};  // row 2
+  std::vector<float> expected = {30, 45, 30, 50, 90};
+  ArgMinMaxParameter arg_param;
+  arg_param.get_max_ = true;
+  arg_param.out_value_ = true;
+  arg_param.topk_ = 1;
+  arg_param.axis_ = 0;
+  arg_param.dims_size_ = 2;
+  arg_param.data_type_ = 43;
+  float result[5];
+  ArgMinMax(data.data(), result, dims.data(), &arg_param);
+  CompareOutputData(result, expected.data(), expected.size(), 0.000001);
+}
+
+// ArgMin along axis 0 of a 3x5 matrix with topk 1, reporting the minimum values themselves.
+TEST_F(TestArgMinMaxTestFp32, ArgMinTest2) {
+  std::vector<int> dims = {3, 5};
+  std::vector<float> data = {10, 20, 30, 40, 90,   // row 0
+                             20, 11, 15, 1,  50,   // row 1
+                             30, 45, 25, 50, 30};  // row 2
+  std::vector<float> expected = {10, 11, 15, 1, 30};
+  ArgMinMaxParameter arg_param;
+  arg_param.get_max_ = false;
+  arg_param.out_value_ = true;
+  arg_param.topk_ = 1;
+  arg_param.axis_ = 0;
+  arg_param.dims_size_ = 2;
+  arg_param.data_type_ = 43;
+  float result[5];
+  ArgMinMax(data.data(), result, dims.data(), &arg_param);
+  CompareOutputData(result, expected.data(), expected.size(), 0.000001);
+}
+
+// Top-2 maximum values along axis 2 of a {1, 1, 3, 5} tensor.
+TEST_F(TestArgMinMaxTestFp32, ArgMaxTest3_axis2_out_data) {
+  std::vector<float> in = {10, 20, 30, 40, 90,
+                           20, 11, 15, 1,  50,
+                           30, 45, 25, 50, 30};
+  std::vector<float> except_out = {30, 45, 30, 50, 90, 20, 20, 25, 40, 50};
+  ArgMinMaxParameter param;
+  param.axis_ = 2;
+  std::vector<int> in_shape = {1, 1, 3, 5};
+  // Scratch buffer for the kernel: one entry per element on the reduced axis.
+  param.arg_elements_ = reinterpret_cast<ArgElement *>(malloc(in_shape[param.axis_] * sizeof(ArgElement)));
+  param.out_value_ = true;
+  param.topk_ = 2;
+  std::vector<int> out_shape = {1, 1, 2, 5};
+  ComputeStrides(in_shape.data(), param.in_strides_, in_shape.size());
+  ComputeStrides(out_shape.data(), param.out_strides_, out_shape.size());
+  float out[10];
+  ArgMaxDim2(in.data(), out, in_shape.data(), &param);
+  CompareOutputData(out, except_out.data(), except_out.size(), 0.00001);
+  // Release the scratch buffer, which was previously leaked.
+  free(param.arg_elements_);
+  param.arg_elements_ = nullptr;
+}
+
+// Top-2 maximum indices along axis 2 of a {1, 1, 3, 5} tensor.
+TEST_F(TestArgMinMaxTestFp32, ArgMaxTest3_axis2_out_index) {
+  std::vector<float> in = {10, 20, 30, 40, 90,
+                           20, 11, 15, 1,  50,
+                           30, 45, 25, 50, 30};
+  std::vector<float> except_out = {2, 2, 0, 2, 0, 1, 0, 2, 0, 1};
+  ArgMinMaxParameter param;
+  param.axis_ = 2;
+  std::vector<int> in_shape = {1, 1, 3, 5};
+  // Scratch buffer for the kernel: one entry per element on the reduced axis.
+  param.arg_elements_ = reinterpret_cast<ArgElement *>(malloc(in_shape[param.axis_] * sizeof(ArgElement)));
+  param.out_value_ = false;
+  param.topk_ = 2;
+  std::vector<int> out_shape = {1, 1, 2, 5};
+  ComputeStrides(in_shape.data(), param.in_strides_, in_shape.size());
+  ComputeStrides(out_shape.data(), param.out_strides_, out_shape.size());
+  float out[10];
+  ArgMaxDim2(in.data(), out, in_shape.data(), &param);
+  CompareOutputData(out, except_out.data(), except_out.size(), 0.00001);
+  // Release the scratch buffer, which was previously leaked.
+  free(param.arg_elements_);
+  param.arg_elements_ = nullptr;
+}
+
+// Top-2 maximum values along axis 3 of a {1, 1, 3, 5} tensor.
+TEST_F(TestArgMinMaxTestFp32, ArgMaxTest4_axis3_out_data) {
+  std::vector<float> in = {10, 20, 30, 40, 90,
+                           20, 11, 15, 1,  50,
+                           30, 45, 25, 50, 30};
+  std::vector<float> except_out = {90, 40,
+                                   50, 20,
+                                   50, 45};
+  ArgMinMaxParameter param;
+  param.axis_ = 3;
+  std::vector<int> in_shape = {1, 1, 3, 5};
+  // Scratch buffer for the kernel: one entry per element on the reduced axis.
+  param.arg_elements_ = reinterpret_cast<ArgElement *>(malloc(in_shape[param.axis_] * sizeof(ArgElement)));
+  param.out_value_ = true;
+  param.topk_ = 2;
+  std::vector<int> out_shape = {1, 1, 3, 2};
+  ComputeStrides(in_shape.data(), param.in_strides_, in_shape.size());
+  ComputeStrides(out_shape.data(), param.out_strides_, out_shape.size());
+  float out[6];
+  ArgMaxDim3(in.data(), out, in_shape.data(), &param);
+  CompareOutputData(out, except_out.data(), except_out.size(), 0.00001);
+  // Release the scratch buffer, which was previously leaked.
+  free(param.arg_elements_);
+  param.arg_elements_ = nullptr;
+}
+
+// Top-2 maximum indices along axis 3 of a {1, 1, 3, 5} tensor.
+TEST_F(TestArgMinMaxTestFp32, ArgMaxTest4_axis3_out_index) {
+  std::vector<float> in = {10, 20, 30, 40, 90,
+                           20, 11, 15, 1,  50,
+                           30, 45, 25, 50, 30};
+  std::vector<float> except_out = {4, 3,
+                                   4, 0,
+                                   3, 1};
+  ArgMinMaxParameter param;
+  param.axis_ = 3;
+  std::vector<int> in_shape = {1, 1, 3, 5};
+  // Scratch buffer for the kernel: one entry per element on the reduced axis.
+  param.arg_elements_ = reinterpret_cast<ArgElement *>(malloc(in_shape[param.axis_] * sizeof(ArgElement)));
+  param.out_value_ = false;
+  param.topk_ = 2;
+  std::vector<int> out_shape = {1, 1, 3, 2};
+  ComputeStrides(in_shape.data(), param.in_strides_, in_shape.size());
+  ComputeStrides(out_shape.data(), param.out_strides_, out_shape.size());
+  float out[6];
+  ArgMaxDim3(in.data(), out, in_shape.data(), &param);
+  CompareOutputData(out, except_out.data(), except_out.size(), 0.00001);
+  // Release the scratch buffer, which was previously leaked.
+  free(param.arg_elements_);
+  param.arg_elements_ = nullptr;
+}
+
+// Top-2 maximum indices along axis 1 of a {2, 3, 2, 3} tensor.
+TEST_F(TestArgMinMaxTestFp32, ArgMaxTest5_axis1_out_index) {
+  std::vector<float> in = {100, 2, 300,
+                            4,  50, 6,
+                            11, 12, 13,
+                            34, 35, 36,
+                            9,  6,  17,
+                            10, 20, 30,
+                            10, 20, 30,
+                            40, 5,  60,
+                            7,  80, 90,
+                            10, 11, 120,
+                            18, 5,  16,
+                            9,  22, 23};
+  std::vector<float> except_out = {0, 1, 0,
+                                   1, 0, 1,
+                                   1, 2, 2,
+                                   2, 1, 2,
+                                   2, 1, 1,
+                                   0, 2, 1,
+                                   0, 0, 0,
+                                   1, 1, 0};
+  ArgMinMaxParameter param;
+  param.axis_ = 1;
+  std::vector<int> in_shape = {2, 3, 2, 3};
+  // Scratch buffer for the kernel: one entry per element on the reduced axis.
+  param.arg_elements_ = reinterpret_cast<ArgElement *>(malloc(in_shape[param.axis_] * sizeof(ArgElement)));
+  param.out_value_ = false;
+  param.topk_ = 2;
+  std::vector<int> out_shape = {2, 2, 2, 3};
+  ComputeStrides(in_shape.data(), param.in_strides_, in_shape.size());
+  ComputeStrides(out_shape.data(), param.out_strides_, out_shape.size());
+  float out[24];
+  ArgMaxDim1(in.data(), out, in_shape.data(), &param);
+  CompareOutputData(out, except_out.data(), except_out.size(), 0.00001);
+  // Release the scratch buffer, which was previously leaked.
+  free(param.arg_elements_);
+  param.arg_elements_ = nullptr;
+}
+
+// Top-2 maximum values along axis 1 of a {2, 3, 2, 3} tensor.
+TEST_F(TestArgMinMaxTestFp32, ArgMaxTest5_axis1_out_data) {
+  std::vector<float> in = {100, 2, 300,
+                            4,  50, 6,
+                            11, 12, 13,
+                            34, 35, 36,
+                            9,  6,  17,
+                            10, 20, 30,
+                            10, 20, 30,
+                            40, 5,  60,
+                            7,  80, 90,
+                            10, 11, 120,
+                            18, 5,  16,
+                            9,  22, 23};
+  std::vector<float> except_out = {100, 12, 300,
+                                   34, 50, 36,
+                                   11, 6, 17,
+                                   10, 35, 30,
+                                   18, 80, 90,
+                                   40, 22, 120,
+                                   10, 20, 30,
+                                   10, 11, 60};
+  ArgMinMaxParameter param;
+  param.axis_ = 1;
+  std::vector<int> in_shape = {2, 3, 2, 3};
+  // Scratch buffer for the kernel: one entry per element on the reduced axis.
+  param.arg_elements_ = reinterpret_cast<ArgElement *>(malloc(in_shape[param.axis_] * sizeof(ArgElement)));
+  param.out_value_ = true;
+  param.topk_ = 2;
+  std::vector<int> out_shape = {2, 2, 2, 3};
+  ComputeStrides(in_shape.data(), param.in_strides_, in_shape.size());
+  ComputeStrides(out_shape.data(), param.out_strides_, out_shape.size());
+  float out[24];
+  ArgMaxDim1(in.data(), out, in_shape.data(), &param);
+  CompareOutputData(out, except_out.data(), except_out.size(), 0.00001);
+  // Release the scratch buffer, which was previously leaked.
+  free(param.arg_elements_);
+  param.arg_elements_ = nullptr;
+}
+
+// Top-2 maximum indices along axis 0 of a {3, 2, 2, 2} tensor.
+TEST_F(TestArgMinMaxTestFp32, ArgMaxTest6_axis0_out_index) {
+  std::vector<float> in = {100, 2,
+                            4,  50,
+                            11, 12,
+                            34, 35,
+                            10, 20,
+                            40, 5,
+                            7,  80,
+                            10, 11,
+                            55, 25,
+                            5,  15,
+                            18, 8,
+                            15, 16};
+  std::vector<float> except_out = {0, 2,
+                                   1, 0,
+                                   2, 1,
+                                   0, 0,
+                                   2, 1,
+                                   2, 2,
+                                   0, 0,
+                                   2, 2};
+  ArgMinMaxParameter param;
+  // The reduced axis is 0 (three candidates per position). It must match the ArgMaxDim0 call below,
+  // otherwise the scratch buffer is sized from the wrong dimension (2 entries instead of 3).
+  param.axis_ = 0;
+  std::vector<int> in_shape = {3, 2, 2, 2};
+  // Scratch buffer for the kernel: one entry per element on the reduced axis.
+  param.arg_elements_ = reinterpret_cast<ArgElement *>(malloc(in_shape[param.axis_] * sizeof(ArgElement)));
+  param.out_value_ = false;
+  param.topk_ = 2;
+  std::vector<int> out_shape = {2, 2, 2, 2};
+  ComputeStrides(in_shape.data(), param.in_strides_, in_shape.size());
+  ComputeStrides(out_shape.data(), param.out_strides_, out_shape.size());
+  float out[16];
+  ArgMaxDim0(in.data(), out, in_shape.data(), &param);
+  CompareOutputData(out, except_out.data(), except_out.size(), 0.00001);
+  // Release the scratch buffer, which was previously leaked.
+  free(param.arg_elements_);
+  param.arg_elements_ = nullptr;
+}
+
+// Top-2 maximum values along axis 0 of a {3, 2, 2, 2} tensor.
+TEST_F(TestArgMinMaxTestFp32, ArgMaxTest6_axis0_out_data) {
+  std::vector<float> in = {100, 2,
+                            4,  50,
+                            11, 12,
+                            34, 35,
+                            10, 20,
+                            40, 5,
+                            7,  80,
+                            10, 11,
+                            55, 25,
+                            5,  15,
+                            18, 8,
+                            15, 16};
+  std::vector<float> except_out = {100, 25,
+                                   40, 50,
+                                   18, 80,
+                                   34, 35,
+                                   55, 20,
+                                   5, 15,
+                                   11, 12,
+                                   15, 16};
+  ArgMinMaxParameter param;
+  // The reduced axis is 0 (three candidates per position). It must match the ArgMaxDim0 call below,
+  // otherwise the scratch buffer is sized from the wrong dimension (2 entries instead of 3).
+  param.axis_ = 0;
+  std::vector<int> in_shape = {3, 2, 2, 2};
+  // Scratch buffer for the kernel: one entry per element on the reduced axis.
+  param.arg_elements_ = reinterpret_cast<ArgElement *>(malloc(in_shape[param.axis_] * sizeof(ArgElement)));
+  param.out_value_ = true;
+  param.topk_ = 2;
+  std::vector<int> out_shape = {2, 2, 2, 2};
+  ComputeStrides(in_shape.data(), param.in_strides_, in_shape.size());
+  ComputeStrides(out_shape.data(), param.out_strides_, out_shape.size());
+  float out[16];
+  ArgMaxDim0(in.data(), out, in_shape.data(), &param);
+  CompareOutputData(out, except_out.data(), except_out.size(), 0.00001);
+  // Release the scratch buffer, which was previously leaked.
+  free(param.arg_elements_);
+  param.arg_elements_ = nullptr;
+}
+
+// Top-2 minimum values along axis 3 of a {1, 1, 3, 5} tensor.
+TEST_F(TestArgMinMaxTestFp32, ArgMinTest1_axis3_out_data) {
+  std::vector<float> in = {10, 20, 30, 40, 90,
+                           20, 11, 15, 1,  50,
+                           30, 45, 25, 50, 30};
+  std::vector<float> except_out = {10, 20,
+                                   1,  11,
+                                   25, 30};
+  ArgMinMaxParameter param;
+  param.axis_ = 3;
+  std::vector<int> in_shape = {1, 1, 3, 5};
+  // Scratch buffer for the kernel: one entry per element on the reduced axis.
+  param.arg_elements_ = reinterpret_cast<ArgElement *>(malloc(in_shape[param.axis_] * sizeof(ArgElement)));
+  param.out_value_ = true;
+  param.topk_ = 2;
+  std::vector<int> out_shape = {1, 1, 3, 2};
+  ComputeStrides(in_shape.data(), param.in_strides_, in_shape.size());
+  ComputeStrides(out_shape.data(), param.out_strides_, out_shape.size());
+  float out[6];
+  ArgMinDim3(in.data(), out, in_shape.data(), &param);
+  CompareOutputData(out, except_out.data(), except_out.size(), 0.00001);
+  // Release the scratch buffer, which was previously leaked.
+  free(param.arg_elements_);
+  param.arg_elements_ = nullptr;
+}
+
+}  // namespace mindspore
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/batch_to_space_fp32_test.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/batch_to_space_fp32_test.cc
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "mindspore/core/utils/log_adapter.h"
+#include "common/common_test.h"
+#include "mindspore/lite/src/runtime/kernel/arm/opclib/batch_to_space.h"
+#include "mindspore/lite/src/runtime/kernel/arm/opclib/arithmetic_common.h"
+
+namespace mindspore {
+
+class BatchToSpaceTestFp32 : public mindspore::Common {  // fixture shared by the TEST_F cases in this file
+ public:
+  BatchToSpaceTestFp32() = default;
+};
+
+
+// No-crop batch-to-space of a {4, 1, 1, 3} input with a 2x2 block into a single output batch;
+// the result is also echoed to stdout.
+TEST_F(BatchToSpaceTestFp32, BatchToSpaceTest1) {
+  constexpr int kOutSize = 12;
+  int block_dims[2] = {2, 2};
+  int input_dims[4] = {4, 1, 1, 3};
+  int out_batch = 1;
+  float src[12] = {10, 30, 90, 2, 20, 120, 5, 50, 150, 6, 16, 160};
+  float expected[kOutSize] = {10, 30, 90, 2, 20, 120, 5, 50, 150, 6, 16, 160};
+  float dst[kOutSize];
+
+  BatchToSpaceNoCropForNHWC(src, dst, input_dims, out_batch, block_dims, sizeof(float));
+  for (float value : dst) {
+    std::cout << value << " ";
+  }
+  std::cout << "\n";
+  CompareOutputData(dst, expected, kOutSize, 0.000001);
+}
+
+// Cropping call on a {4, 1, 1, 3} input with block {2, 2}, out_n = 1 and crops {1, 0, 0, 1}.
+// Three values are expected: {5, 50, 150}.
+TEST_F(BatchToSpaceTestFp32, BatchToSpaceTest_crop_1) {
+  constexpr int kNumOut = 3;
+  float src[12] = {10, 30, 90, 2, 20, 120, 5, 50, 150, 6, 16, 160};
+  float expected[kNumOut] = {5, 50, 150};
+  float result[kNumOut];
+
+  int src_shape[4] = {4, 1, 1, 3};
+  int block_shape[2] = {2, 2};
+  int crop_sizes[4] = {1, 0, 0, 1};
+  int out_batch = 1;
+  BatchToSpaceForNHWC(src, result, src_shape, out_batch, block_shape, crop_sizes, sizeof(float));
+
+  // Dump what the kernel wrote so a failing comparison is easy to inspect.
+  for (const float value : result) {
+    std::cout << value << " ";
+  }
+  std::cout << "\n";
+  CompareOutputData(result, expected, kNumOut, 1e-6);
+}
+
+// No-crop call on a {4, 2, 2, 2} input with block {2, 2} and out_n = 1; all 32 output
+// values are compared against the expected buffer.
+TEST_F(BatchToSpaceTestFp32, BatchToSpaceTest2) {
+  constexpr int kNumOut = 32;
+  float src[32] = {1, 10, 3, 30, 9,  90,  11, 110, 2, 20, 4, 40, 10, 100, 12, 120,
+                   5, 50, 7, 70, 13, 130, 15, 150, 6, 60, 8, 80, 14, 140, 16, 160};
+  float expected[kNumOut] = {1, 10, 2,  20,  3,  30,  4,  40,  5,  50,  6,  60,  7,  70,  8,  80,
+                             9, 90, 10, 100, 11, 110, 12, 120, 13, 130, 14, 140, 15, 150, 16, 160};
+  float result[kNumOut];
+
+  int src_shape[4] = {4, 2, 2, 2};
+  int block_shape[2] = {2, 2};
+  int out_batch = 1;
+  BatchToSpaceNoCropForNHWC(src, result, src_shape, out_batch, block_shape, sizeof(float));
+
+  // Dump what the kernel wrote so a failing comparison is easy to inspect.
+  for (const float value : result) {
+    std::cout << value << " ";
+  }
+  std::cout << "\n";
+  CompareOutputData(result, expected, kNumOut, 1e-6);
+}
+
+// Cropping call on a {4, 2, 2, 2} input with block {2, 2}, out_n = 1 and crops {1, 1, 1, 0};
+// 12 values are expected.
+TEST_F(BatchToSpaceTestFp32, BatchToSpaceTest_crop_2) {
+  constexpr int kNumOut = 12;
+  float src[32] = {1, 10, 3, 30, 9,  90,  11, 110, 2, 20, 4, 40, 10, 100, 12, 120,
+                   5, 50, 7, 70, 13, 130, 15, 150, 6, 60, 8, 80, 14, 140, 16, 160};
+  float expected[kNumOut] = {6,  60,  7,  70,  8,  80,
+                             10, 100, 11, 110, 12, 120};
+  float result[kNumOut];
+
+  int src_shape[4] = {4, 2, 2, 2};
+  int block_shape[2] = {2, 2};
+  int crop_sizes[4] = {1, 1, 1, 0};
+  int out_batch = 1;
+  BatchToSpaceForNHWC(src, result, src_shape, out_batch, block_shape, crop_sizes, sizeof(float));
+
+  // Dump what the kernel wrote so a failing comparison is easy to inspect.
+  for (const float value : result) {
+    std::cout << value << " ";
+  }
+  std::cout << "\n";
+  CompareOutputData(result, expected, kNumOut, 1e-6);
+}
+
+// No-crop call on an {8, 2, 2, 2} input with block {2, 2} and out_n = 2. All 64 outputs are
+// compared, but only the first 32 are printed.
+TEST_F(BatchToSpaceTestFp32, BatchToSpaceTest3) {
+  constexpr int kNumOut = 64;
+  constexpr int kPrintCount = kNumOut < 32 ? kNumOut : 32;
+  float src[64] = {1,  10, 3,  30, 9,   90,  11,  110, 2,  20, 4,  40, 10,  100, 12,  120,
+                   5,  50, 7,  70, 13,  130, 15,  150, 6,  60, 8,  80, 14,  140, 16,  160,
+                   21, 10, 23, 30, 29,  90,  211, 110, 22, 20, 24, 40, 210, 100, 212, 120,
+                   25, 50, 27, 70, 213, 130, 215, 150, 26, 60, 28, 80, 214, 140, 216, 160};
+  float expected[kNumOut] = {1,  10,  5,  50,  3,  30,  7,  70,  21,  10,  25,  50,  23,  30,  27,  70,
+                             9,  90,  13, 130, 11, 110, 15, 150, 29,  90,  213, 130, 211, 110, 215, 150,
+                             2,  20,  6,  60,  4,  40,  8,  80,  22,  20,  26,  60,  24,  40,  28,  80,
+                             10, 100, 14, 140, 12, 120, 16, 160, 210, 100, 214, 140, 212, 120, 216, 160};
+  float result[kNumOut];
+
+  int src_shape[4] = {8, 2, 2, 2};
+  int block_shape[2] = {2, 2};
+  int out_batch = 2;
+  BatchToSpaceNoCropForNHWC(src, result, src_shape, out_batch, block_shape, sizeof(float));
+
+  // Printing is capped at 32 values to keep the log short.
+  for (int idx = 0; idx < kPrintCount; ++idx) {
+    std::cout << result[idx] << " ";
+  }
+  std::cout << "\n";
+  CompareOutputData(result, expected, kNumOut, 1e-6);
+}
+
+// Cropping call on an {8, 2, 2, 2} input with block {2, 2}, out_n = 2 and crops {2, 0, 0, 2};
+// 16 values are expected.
+TEST_F(BatchToSpaceTestFp32, BatchToSpaceTest_crop_3) {
+  constexpr int kNumOut = 16;
+  constexpr int kPrintCount = kNumOut < 32 ? kNumOut : 32;
+  float src[64] = {1,  10, 3,  30, 9,   90,  11,  110, 2,  20, 4,  40, 10,  100, 12,  120,
+                   5,  50, 7,  70, 13,  130, 15,  150, 6,  60, 8,  80, 14,  140, 16,  160,
+                   21, 10, 23, 30, 29,  90,  211, 110, 22, 20, 24, 40, 210, 100, 212, 120,
+                   25, 50, 27, 70, 213, 130, 215, 150, 26, 60, 28, 80, 214, 140, 216, 160};
+  float expected[kNumOut] = {9,  90,  13, 130, 29,  90,  213, 130,
+                             10, 100, 14, 140, 210, 100, 214, 140};
+  float result[kNumOut];
+
+  int src_shape[4] = {8, 2, 2, 2};
+  int block_shape[2] = {2, 2};
+  int crop_sizes[4] = {2, 0, 0, 2};
+  int out_batch = 2;
+  BatchToSpaceForNHWC(src, result, src_shape, out_batch, block_shape, crop_sizes, sizeof(float));
+
+  // Printing is capped at 32 values; here that covers the whole output.
+  for (int idx = 0; idx < kPrintCount; ++idx) {
+    std::cout << result[idx] << " ";
+  }
+  std::cout << "\n";
+  CompareOutputData(result, expected, kNumOut, 1e-6);
+}
+
+// No-crop call on a {12, 2, 2, 2} input with a non-square block {3, 2} and out_n = 2. All 96
+// outputs are compared, but only the first 32 are printed.
+TEST_F(BatchToSpaceTestFp32, BatchToSpaceTest4) {
+  constexpr int kNumOut = 96;
+  constexpr int kPrintCount = kNumOut < 32 ? kNumOut : 32;
+  float src[96] = {1,   10,  3,   30,  9,   90,  11,  110, 2,  20,  4,   40,  10,  100, 12,  120, 5,   50,  7,   70,
+                   13,  130, 15,  150, 6,   60,  8,   80,  14, 140, 16,  160, 21,  10,  23,  30,  29,  90,  211, 110,
+                   22,  20,  24,  40,  210, 100, 212, 120, 25, 50,  27,  70,  213, 130, 215, 150, 26,  60,  28,  80,
+                   214, 140, 216, 160, 31,  10,  33,  30,  39, 90,  311, 110, 32,  20,  34,  40,  310, 100, 312, 120,
+                   35,  50,  37,  70,  313, 130, 315, 150, 36, 60,  38,  80,  314, 140, 316, 160};
+  float expected[kNumOut] = {
+    1,  10,  5,  50,  3,  30,  7,  70,  21,  10,  25,  50,  23,  30,  27,  70,  31,  10,  35,  50,  33,  30,  37,  70,
+    9,  90,  13, 130, 11, 110, 15, 150, 29,  90,  213, 130, 211, 110, 215, 150, 39,  90,  313, 130, 311, 110, 315, 150,
+    2,  20,  6,  60,  4,  40,  8,  80,  22,  20,  26,  60,  24,  40,  28,  80,  32,  20,  36,  60,  34,  40,  38,  80,
+    10, 100, 14, 140, 12, 120, 16, 160, 210, 100, 214, 140, 212, 120, 216, 160, 310, 100, 314, 140, 312, 120, 316, 160};
+  float result[kNumOut];
+
+  int src_shape[4] = {12, 2, 2, 2};
+  int block_shape[2] = {3, 2};
+  int out_batch = 2;
+  BatchToSpaceNoCropForNHWC(src, result, src_shape, out_batch, block_shape, sizeof(float));
+
+  // Printing is capped at 32 values to keep the log short.
+  for (int idx = 0; idx < kPrintCount; ++idx) {
+    std::cout << result[idx] << " ";
+  }
+  std::cout << "\n";
+  CompareOutputData(result, expected, kNumOut, 1e-6);
+}
+
+// Cropping call on a {12, 2, 2, 2} input with block {3, 2}, out_n = 2 and crops {1, 2, 1, 1};
+// 24 values are expected.
+TEST_F(BatchToSpaceTestFp32, BatchToSpaceTest_crop_4) {
+  constexpr int kNumOut = 24;
+  constexpr int kPrintCount = kNumOut < 32 ? kNumOut : 32;
+  float src[96] = {1,   10,  3,   30,  9,   90,  11,  110, 2,  20,  4,   40,  10,  100, 12,  120, 5,   50,  7,   70,
+                   13,  130, 15,  150, 6,   60,  8,   80,  14, 140, 16,  160, 21,  10,  23,  30,  29,  90,  211, 110,
+                   22,  20,  24,  40,  210, 100, 212, 120, 25, 50,  27,  70,  213, 130, 215, 150, 26,  60,  28,  80,
+                   214, 140, 216, 160, 31,  10,  33,  30,  39, 90,  311, 110, 32,  20,  34,  40,  310, 100, 312, 120,
+                   35,  50,  37,  70,  313, 130, 315, 150, 36, 60,  38,  80,  314, 140, 316, 160};
+  float expected[kNumOut] = {
+    25,  50,  23,  30,  35,  50,  33,  30,
+    13, 130, 11, 110,  26,  60,  24,  40, 36,  60,  34,  40, 14, 140, 12, 120};
+  float result[kNumOut];
+
+  int src_shape[4] = {12, 2, 2, 2};
+  int block_shape[2] = {3, 2};
+  int crop_sizes[4] = {1, 2, 1, 1};
+  int out_batch = 2;
+  BatchToSpaceForNHWC(src, result, src_shape, out_batch, block_shape, crop_sizes, sizeof(float));
+
+  // Printing is capped at 32 values; here that covers the whole output.
+  for (int idx = 0; idx < kPrintCount; ++idx) {
+    std::cout << result[idx] << " ";
+  }
+  std::cout << "\n";
+  CompareOutputData(result, expected, kNumOut, 1e-6);
+}
+
+}  // namespace mindspore
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/crop_fp32_test.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/crop_fp32_test.cc
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "common/common_test.h"
+#include "mindspore/lite/src/runtime/kernel/arm/opclib/fp32/crop.h"
+
+namespace mindspore {
+// Fixture shared by the fp32 crop tests below; it adds nothing on top of Common.
+class CropTestFp32 : public Common {
+ public:
+  CropTestFp32() = default;
+};
+
+// Crop4D on a {2, 2, 2, 2} input down to {2, 1, 1, 1}, with axis_ = 1 and offsets {1, 1, 1},
+// run as the only thread (thread_num_ = 1, thread_id_ = 0). Expects {8, 16}.
+TEST_F(CropTestFp32, CropTest1) {
+  float input[16] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
+  const int kOutSize = 2;
+  float expect_out[kOutSize] = {8, 16};
+
+  float output[kOutSize];
+  int in_shape[4] = {2, 2, 2, 2};
+  int out_shape[4] = {2, 1, 1, 1};
+  // Value-initialize so fields this test never assigns (e.g. offset_[3]) are zero instead of
+  // indeterminate when the kernel receives the struct.
+  CropParameter crop_param = {};
+  crop_param.axis_ = 1;
+  crop_param.offset_[0] = 1;
+  crop_param.offset_[1] = 1;
+  crop_param.offset_[2] = 1;
+  crop_param.op_parameter_.thread_num_ = 1;
+  crop_param.thread_id_ = 0;
+  Crop4D(input, output, in_shape, out_shape, &crop_param);
+  // Dump what the kernel wrote so a failing comparison is easy to inspect.
+  for (int i = 0; i < kOutSize; ++i) {
+    std::cout << output[i] << " ";
+  }
+  std::cout << "\n";
+  CompareOutputData(output, expect_out, kOutSize, 0.000001);
+}
+
+// Crop4D from axis 0 on a {2, 2, 2, 2} input with offsets {1, 1, 0, 0} and a {1, 1, 2, 2}
+// output, run as the only thread. The expected values are the last four inputs.
+TEST_F(CropTestFp32, CropTest2) {
+  const int kNumOut = 4;
+  float src[16] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
+  float expected[kNumOut] = {13, 14, 15, 16};
+  float result[kNumOut];
+
+  int src_shape[4] = {2, 2, 2, 2};
+  int dst_shape[4] = {1, 1, 2, 2};
+
+  CropParameter param;
+  param.thread_id_ = 0;
+  param.op_parameter_.thread_num_ = 1;
+  param.axis_ = 0;
+  const int offsets[4] = {1, 1, 0, 0};
+  for (int dim = 0; dim < 4; ++dim) {
+    param.offset_[dim] = offsets[dim];
+  }
+
+  Crop4D(src, result, src_shape, dst_shape, &param);
+
+  // Dump what the kernel wrote so a failing comparison is easy to inspect.
+  for (const float value : result) {
+    std::cout << value << " ";
+  }
+  std::cout << "\n";
+  CompareOutputData(result, expected, kNumOut, 1e-6);
+}
+
+// Crop4D on the last axis (axis_ = 3, offset 1) of a {2, 2, 2, 2} input, giving a {2, 2, 2, 1}
+// output, run as the only thread. Expects every second input value: {2, 4, ..., 16}.
+TEST_F(CropTestFp32, CropTest3) {
+  float input[16] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
+  const int kOutSize = 8;
+  float expect_out[kOutSize] = {2, 4, 6, 8, 10, 12, 14, 16};
+
+  float output[kOutSize];
+  int in_shape[4] = {2, 2, 2, 2};
+  int out_shape[4] = {2, 2, 2, 1};
+  // Value-initialize so offset_[1..3], which this test never assigns, are zero instead of
+  // indeterminate when the kernel receives the struct.
+  CropParameter crop_param = {};
+  crop_param.axis_ = 3;
+  crop_param.offset_[0] = 1;
+  crop_param.op_parameter_.thread_num_ = 1;
+  crop_param.thread_id_ = 0;
+  Crop4D(input, output, in_shape, out_shape, &crop_param);
+  // Dump what the kernel wrote so a failing comparison is easy to inspect.
+  for (int i = 0; i < kOutSize; ++i) {
+    std::cout << output[i] << " ";
+  }
+  std::cout << "\n";
+  CompareOutputData(output, expect_out, kOutSize, 0.000001);
+}
+
+// Same data as CropTest3 (axis_ = 3, offset 1, {2, 2, 2, 2} -> {2, 2, 2, 1}), but with
+// thread_num_ = 2: Crop4D is called once for thread_id_ 0 and once for thread_id_ 1 on the
+// same output buffer before the result is compared.
+TEST_F(CropTestFp32, CropTest4) {
+  float input[16] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
+  const int kOutSize = 8;
+  float expect_out[kOutSize] = {2, 4, 6, 8, 10, 12, 14, 16};
+
+  float output[kOutSize];
+  int in_shape[4] = {2, 2, 2, 2};
+  int out_shape[4] = {2, 2, 2, 1};
+  // Value-initialize so offset_[1..3], which this test never assigns, are zero instead of
+  // indeterminate when the kernel receives the struct.
+  CropParameter crop_param = {};
+  crop_param.axis_ = 3;
+  crop_param.offset_[0] = 1;
+  crop_param.op_parameter_.thread_num_ = 2;
+  crop_param.thread_id_ = 0;
+  Crop4D(input, output, in_shape, out_shape, &crop_param);
+  crop_param.thread_id_ = 1;
+  Crop4D(input, output, in_shape, out_shape, &crop_param);
+  // Dump what the kernel wrote so a failing comparison is easy to inspect.
+  for (int i = 0; i < kOutSize; ++i) {
+    std::cout << output[i] << " ";
+  }
+  std::cout << "\n";
+  CompareOutputData(output, expect_out, kOutSize, 0.000001);
+}
+
+// Crop4DNoParallel on a {2, 2, 2, 2} input down to {2, 1, 1, 1}, with axis_ = 1 and offsets
+// {1, 1, 1}. Expects {8, 16}, the same data as CropTest1.
+TEST_F(CropTestFp32, CropTest5) {
+  float input[16] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
+  const int kOutSize = 2;
+  float expect_out[kOutSize] = {8, 16};
+
+  float output[kOutSize];
+  int in_shape[4] = {2, 2, 2, 2};
+  int out_shape[4] = {2, 1, 1, 1};
+  // Value-initialize so the fields this test never assigns (offset_[3], thread_id_,
+  // op_parameter_) are zero instead of indeterminate when the kernel receives the struct.
+  CropParameter crop_param = {};
+  crop_param.axis_ = 1;
+  crop_param.offset_[0] = 1;
+  crop_param.offset_[1] = 1;
+  crop_param.offset_[2] = 1;
+  Crop4DNoParallel(input, output, in_shape, out_shape, &crop_param);
+  // Dump what the kernel wrote so a failing comparison is easy to inspect.
+  for (int i = 0; i < kOutSize; ++i) {
+    std::cout << output[i] << " ";
+  }
+  std::cout << "\n";
+  CompareOutputData(output, expect_out, kOutSize, 0.000001);
+}
+
+// Crop4DNoParallel from axis 0 on a {2, 2, 2, 2} input with offsets {1, 1, 0, 0} and a
+// {1, 1, 2, 2} output. Expects the last four inputs, the same data as CropTest2.
+TEST_F(CropTestFp32, CropTest6) {
+  float input[16] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
+  const int kOutSize = 4;
+  float expect_out[kOutSize] = {13, 14, 15, 16};
+
+  float output[kOutSize];
+  int in_shape[4] = {2, 2, 2, 2};
+  int out_shape[4] = {1, 1, 2, 2};
+  // Value-initialize so the fields this test never assigns (thread_id_, op_parameter_) are
+  // zero instead of indeterminate when the kernel receives the struct.
+  CropParameter crop_param = {};
+  crop_param.axis_ = 0;
+  crop_param.offset_[0] = 1;
+  crop_param.offset_[1] = 1;
+  crop_param.offset_[2] = 0;
+  crop_param.offset_[3] = 0;
+  Crop4DNoParallel(input, output, in_shape, out_shape, &crop_param);
+  // Dump what the kernel wrote so a failing comparison is easy to inspect.
+  for (int i = 0; i < kOutSize; ++i) {
+    std::cout << output[i] << " ";
+  }
+  std::cout << "\n";
+  CompareOutputData(output, expect_out, kOutSize, 0.000001);
+}
+
+// Crop4DNoParallel on the last axis (axis_ = 3, offset 1) of a {2, 2, 2, 2} input, giving a
+// {2, 2, 2, 1} output. Expects every second input value, the same data as CropTest3.
+TEST_F(CropTestFp32, CropTest7) {
+  float input[16] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
+  const int kOutSize = 8;
+  float expect_out[kOutSize] = {2, 4, 6, 8, 10, 12, 14, 16};
+
+  float output[kOutSize];
+  int in_shape[4] = {2, 2, 2, 2};
+  int out_shape[4] = {2, 2, 2, 1};
+  // Value-initialize so the fields this test never assigns (offset_[1..3], thread_id_,
+  // op_parameter_) are zero instead of indeterminate when the kernel receives the struct.
+  CropParameter crop_param = {};
+  crop_param.axis_ = 3;
+  crop_param.offset_[0] = 1;
+  Crop4DNoParallel(input, output, in_shape, out_shape, &crop_param);
+  // Dump what the kernel wrote so a failing comparison is easy to inspect.
+  for (int i = 0; i < kOutSize; ++i) {
+    std::cout << output[i] << " ";
+  }
+  std::cout << "\n";
+  CompareOutputData(output, expect_out, kOutSize, 0.000001);
+}
+
+// Crop4D on a {1, 3, 3, 3} input down to {1, 1, 2, 2}, with axis_ = 1 and offsets {1, 1, 1}.
+// thread_num_ is 2, so Crop4D is called once per thread id on the same output buffer.
+// Expects {15, 16, 18, 19}.
+TEST_F(CropTestFp32, CropTest8) {
+  float input[27] = {1,  2,  3,  4,  5,  6,  7,  8,  9,
+                     11, 12, 13, 14, 15, 16, 17, 18, 19,
+                     21, 22, 23, 24, 25, 26, 27, 28, 29};
+  const int kOutSize = 4;
+  float expect_out[kOutSize] = {15, 16, 18, 19};
+
+  float output[kOutSize];
+  int in_shape[4] = {1, 3, 3, 3};
+  int out_shape[4] = {1, 1, 2, 2};
+  // Value-initialize so fields this test never assigns (e.g. offset_[3]) are zero instead of
+  // indeterminate when the kernel receives the struct.
+  CropParameter crop_param = {};
+  crop_param.axis_ = 1;
+  crop_param.offset_[0] = 1;
+  crop_param.offset_[1] = 1;
+  crop_param.offset_[2] = 1;
+  crop_param.op_parameter_.thread_num_ = 2;
+  crop_param.thread_id_ = 0;
+  Crop4D(input, output, in_shape, out_shape, &crop_param);
+  crop_param.thread_id_ = 1;
+  Crop4D(input, output, in_shape, out_shape, &crop_param);
+  // Dump what the kernel wrote so a failing comparison is easy to inspect.
+  for (int i = 0; i < kOutSize; ++i) {
+    std::cout << output[i] << " ";
+  }
+  std::cout << "\n";
+  CompareOutputData(output, expect_out, kOutSize, 0.000001);
+}
+
+// Crop4D on a {1, 4, 4, 4} input down to {1, 2, 2, 2}, with axis_ = 1 and offsets {1, 1, 1}.
+// thread_num_ is 2, so Crop4D is called once per thread id on the same output buffer.
+// Expects {16, 17, 110, 111, 26, 27, 210, 211}.
+TEST_F(CropTestFp32, CropTest9) {
+  float input[64] = {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16,
+                     11, 12, 13, 14, 15, 16, 17, 18, 19, 110, 111, 112, 113, 114, 115, 116,
+                     21, 22, 23, 24, 25, 26, 27, 28, 29, 210, 211, 212, 213, 214, 215, 216,
+                     31, 32, 33, 34, 35, 36, 37, 38, 39, 310, 311, 312, 313, 314, 315, 316};
+  const int kOutSize = 8;
+  float expect_out[kOutSize] = {16, 17, 110, 111, 26, 27, 210, 211};
+
+  float output[kOutSize];
+  int in_shape[4] = {1, 4, 4, 4};
+  int out_shape[4] = {1, 2, 2, 2};
+  // Value-initialize so fields this test never assigns (e.g. offset_[3]) are zero instead of
+  // indeterminate when the kernel receives the struct.
+  CropParameter crop_param = {};
+  crop_param.axis_ = 1;
+  crop_param.offset_[0] = 1;
+  crop_param.offset_[1] = 1;
+  crop_param.offset_[2] = 1;
+  crop_param.op_parameter_.thread_num_ = 2;
+  crop_param.thread_id_ = 0;
+  Crop4D(input, output, in_shape, out_shape, &crop_param);
+  crop_param.thread_id_ = 1;
+  Crop4D(input, output, in_shape, out_shape, &crop_param);
+  // Dump what the kernel wrote so a failing comparison is easy to inspect.
+  for (int i = 0; i < kOutSize; ++i) {
+    std::cout << output[i] << " ";
+  }
+  std::cout << "\n";
+  CompareOutputData(output, expect_out, kOutSize, 0.000001);
+}
+
+}  // namespace mindspore
+
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/depth_to_space_fp32_test.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/depth_to_space_fp32_test.cc
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "mindspore/core/utils/log_adapter.h"
+#include "common/common_test.h"
+#include "mindspore/lite/src/runtime/kernel/arm/opclib/depth_to_space.h"
+#include "mindspore/lite/src/runtime/kernel/arm/opclib/arithmetic_common.h"
+
+namespace mindspore {
+
+// Fixture shared by the fp32 depth-to-space tests below; it adds nothing on top of Common.
+class DepthToSpaceTestFp32 : public Common {
+ public:
+  DepthToSpaceTestFp32() = default;
+};
+
+// DepthToSpaceForNHWC with block_size_ = 2 on a {1, 2, 2, 4} input. out_shape {1, 4, 4, 1} is
+// only used to obtain the output strides stored in the parameter struct. All 16 outputs are
+// compared against expect_out.
+TEST_F(DepthToSpaceTestFp32, DepthToSpaceTest2) {
+  float input[16] = {1, 2, 10, 20, 5, 6, 3, 8, 18, 10, 11, 55, 3, 4, 15, 25};
+  constexpr int kOutSize = 16;
+  float expect_out[kOutSize] = {1, 2, 5, 6, 10, 20, 3, 8, 18, 10, 3, 4, 11, 55, 15, 25};
+
+  float output[kOutSize];
+  int in_shape[4] = {1, 2, 2, 4};
+  int out_shape[4] = {1, 4, 4, 1};
+  // Value-initialize so any field this test does not assign is zero instead of indeterminate
+  // when the kernel receives the struct.
+  DepthToSpaceParameter param = {};
+  param.block_size_ = 2;
+  int in_strides[4];
+  ComputeStrides(in_shape, in_strides, 4);
+  int out_strides[4];
+  ComputeStrides(out_shape, out_strides, 4);
+  // Only the first three strides of each shape are copied into the parameter struct.
+  param.in_stride_dim0_ = in_strides[0];
+  param.in_stride_dim1_ = in_strides[1];
+  param.in_stride_dim2_ = in_strides[2];
+  param.out_stride_dim0_ = out_strides[0];
+  param.out_stride_dim1_ = out_strides[1];
+  param.out_stride_dim2_ = out_strides[2];
+  param.data_type_size_ = sizeof(float);
+  DepthToSpaceForNHWC(static_cast<const void *>(input), output, in_shape, &param);
+  // Dump what the kernel wrote so a failing comparison is easy to inspect.
+  for (int i = 0; i < kOutSize; ++i) {
+    std::cout << output[i] << " ";
+  }
+  std::cout << "\n";
+  CompareOutputData(output, expect_out, kOutSize, 0.000001);
+}
+
+// DepthToSpaceForNHWC with block_size_ = 2 on a {1, 1, 1, 8} input. out_shape {1, 2, 2, 2} is
+// only used to obtain the output strides stored in the parameter struct. The expected buffer
+// holds the same 8 values as the input, in the same order.
+TEST_F(DepthToSpaceTestFp32, DepthToSpaceTest3) {
+  float input[8] = {1, 2, 3, 4, 5, 6, 7, 8};
+  constexpr int kOutSize = 8;
+  float expect_out[kOutSize] = {1, 2, 3, 4, 5, 6, 7, 8};
+
+  float output[kOutSize];
+  int in_shape[4] = {1, 1, 1, 8};
+  int out_shape[4] = {1, 2, 2, 2};
+  // Value-initialize so any field this test does not assign is zero instead of indeterminate
+  // when the kernel receives the struct.
+  DepthToSpaceParameter param = {};
+  param.block_size_ = 2;
+  int in_strides[4];
+  ComputeStrides(in_shape, in_strides, 4);
+  int out_strides[4];
+  ComputeStrides(out_shape, out_strides, 4);
+  // Only the first three strides of each shape are copied into the parameter struct.
+  param.in_stride_dim0_ = in_strides[0];
+  param.in_stride_dim1_ = in_strides[1];
+  param.in_stride_dim2_ = in_strides[2];
+  param.out_stride_dim0_ = out_strides[0];
+  param.out_stride_dim1_ = out_strides[1];
+  param.out_stride_dim2_ = out_strides[2];
+  param.data_type_size_ = sizeof(float);
+  DepthToSpaceForNHWC(static_cast<const void *>(input), output, in_shape, &param);
+  // Dump what the kernel wrote so a failing comparison is easy to inspect.
+  for (int i = 0; i < kOutSize; ++i) {
+    std::cout << output[i] << " ";
+  }
+  std::cout << "\n";
+  CompareOutputData(output, expect_out, kOutSize, 0.000001);
+}
+}  // namespace mindspore