diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/concat.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/concat.cl index c2ae7c910631fd15dfd9dfeb5f8e2f67a395904a..b9ad6ad6776c50b605781332cd4ec448674175e6 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/concat.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/concat.cl @@ -2,39 +2,106 @@ __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; __kernel void Concat(__read_only image2d_t input0, __read_only image2d_t input1, __write_only image2d_t output, - int2 input_channels, int4 output_shape) { - int X = get_global_id(0); // H + int4 input_shape0, int4 input_shape1, int4 output_shape, const int axis) { + int X = get_global_id(0); // N*H int Y = get_global_id(1); // W int Z = get_global_id(2); // c/4 - if (X >= output_shape.y || Y >= output_shape.z || Z >= output_shape.w) { + if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) { return; } - if (Z < input_channels.x) { - FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y)*input_channels.x + Z, (X))); - WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result); + if (axis == 0) { + if (X < input_shape0.x * input_shape0.y) { + FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y)*input_shape0.w + Z, (X))); + WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result); + } else { + FLT4 result = READ_IMAGE(input1, smp_none, (int2)((Y)*input_shape1.w + Z, (X - input_shape0.x * input_shape0.y))); + WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result); + } + } else if (axis == 1) { + if (X < input_shape0.y) { + FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y)*input_shape0.w + Z, (X))); + WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result); + } else { + FLT4 result = READ_IMAGE(input1, smp_none, (int2)((Y)*input_shape1.w + Z, (X - input_shape0.y))); + WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result); + } + } else if 
(axis == 2) { + if (Y < input_shape0.z) { + FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y)*input_shape0.w + Z, (X))); + WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result); + } else { + FLT4 result = READ_IMAGE(input1, smp_none, (int2)((Y - input_shape0.z) * input_shape1.w + Z, (X))); + WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result); + } } else { - FLT4 result = READ_IMAGE(input1, smp_none, (int2)((Y)*input_channels.y + Z - input_channels.x, (X))); - WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result); + if (Z < input_shape0.w) { + FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y)*input_shape0.w + Z, (X))); + WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result); + } else { + FLT4 result = READ_IMAGE(input1, smp_none, (int2)((Y)*input_shape1.w + Z - input_shape0.w, (X))); + WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result); + } } } __kernel void Concat3input(__read_only image2d_t input0, __read_only image2d_t input1, __read_only image2d_t input2, - __write_only image2d_t output, int3 input_channels, int4 output_shape) { - int X = get_global_id(0); // H + __write_only image2d_t output, int4 input_shape0, int4 input_shape1, int4 input_shape2, + int4 output_shape, const int axis) { + int X = get_global_id(0); // N*H int Y = get_global_id(1); // W int Z = get_global_id(2); // c/4 - if (X >= output_shape.y || Y >= output_shape.z || Z >= output_shape.w) { + if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) { return; } - if (Z < input_channels.x) { - FLT4 result0 = READ_IMAGE(input0, smp_none, (int2)((Y)*input_channels.x + Z, (X))); - WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result0); - } else if (Z < (input_channels.x + input_channels.y)) { - FLT4 result1 = READ_IMAGE(input1, smp_none, (int2)((Y)*input_channels.y + Z - input_channels.x, (X))); - WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result1); + if (axis == 0) { + if (X 
< input_shape0.x * input_shape0.y) { + FLT4 result0 = READ_IMAGE(input0, smp_none, (int2)((Y)*input_shape0.w + Z, (X))); + WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result0); + } else if (X < (input_shape0.x * input_shape0.y + input_shape1.x * input_shape1.y)) { + FLT4 result1 = + READ_IMAGE(input1, smp_none, (int2)((Y)*input_shape1.w + Z, (X - input_shape0.x * input_shape0.y))); + WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result1); + } else { + FLT4 result2 = READ_IMAGE( + input2, smp_none, + (int2)((Y)*input_shape2.w + Z, (X - input_shape0.x * input_shape0.y - input_shape1.x * input_shape1.y))); + WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result2); + } + } else if (axis == 1) { + if (X < input_shape0.y) { + FLT4 result0 = READ_IMAGE(input0, smp_none, (int2)((Y)*input_shape0.w + Z, (X))); + WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result0); + } else if (X < (input_shape0.y + input_shape1.y)) { + FLT4 result1 = READ_IMAGE(input1, smp_none, (int2)((Y)*input_shape1.w + Z, (X - input_shape0.y))); + WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result1); + } else { + FLT4 result2 = + READ_IMAGE(input2, smp_none, (int2)((Y)*input_shape2.w + Z, (X - input_shape0.y - input_shape1.y))); + WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result2); + } + } else if (axis == 2) { + if (Y < input_shape0.z) { + FLT4 result0 = READ_IMAGE(input0, smp_none, (int2)((Y)*input_shape0.w + Z, (X))); + WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result0); + } else if (Y < (input_shape0.z + input_shape1.z)) { + FLT4 result1 = READ_IMAGE(input1, smp_none, (int2)((Y - input_shape0.z) * input_shape1.w + Z, (X))); + WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result1); + } else { + FLT4 result2 = + READ_IMAGE(input2, smp_none, (int2)((Y - input_shape0.z - input_shape1.z) * input_shape2.w + Z, (X))); + WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result2); + } } else { - FLT4 
result2 = - READ_IMAGE(input2, smp_none, (int2)((Y)*input_channels.z + Z - input_channels.x - input_channels.y, (X))); - WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result2); + if (Z < input_shape0.w) { + FLT4 result0 = READ_IMAGE(input0, smp_none, (int2)((Y)*input_shape0.w + Z, (X))); + WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result0); + } else if (Z < (input_shape0.w + input_shape1.w)) { + FLT4 result1 = READ_IMAGE(input1, smp_none, (int2)((Y)*input_shape1.w + Z - input_shape0.w, (X))); + WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result1); + } else { + FLT4 result2 = + READ_IMAGE(input2, smp_none, (int2)((Y)*input_shape2.w + Z - input_shape0.w - input_shape1.w, (X))); + WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result2); + } } } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/batchnorm.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/batchnorm.cc index ee8eba8ad3fd0212f90a3b33553c6ca80b0c95d0..5c98f9dfe06300d4f432b2dafbf9e95dc93030fb 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/batchnorm.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/batchnorm.cc @@ -100,7 +100,7 @@ void BatchNormGetWorkGroup(const std::vector &global, std::vectorpush_back(z); } int BatchNormOpenCLKernel::Run() { - MS_LOG(DEBUG) << this->name() << " Running!"; + MS_LOG(DEBUG) << this->name() << " Running! 
"; auto param = reinterpret_cast(this->op_parameter_); auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); auto input0_shape = in_tensors_[0]->shape(); @@ -136,12 +136,12 @@ kernel::LiteKernel *OpenCLBatchnormKernelCreator(const std::vectorInit(); if (ret != RET_OK) { - MS_LOG(ERROR) << "Init kernel failed, name: Convolution"; + MS_LOG(ERROR) << " Init kernel failed, name: Batchnorm "; delete kernel; return nullptr; } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc index 2203508a9728c2852323cc675b106e881d584ade..f76fc5259eeddf261a71726785533dc12c5b8179 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc @@ -33,10 +33,10 @@ int ConcatOpenCLKernel::GetImageSize(size_t idx, std::vector *img_size) size_t im_dst_x, im_dst_y; if (in_tensors_[0]->GetFormat() == schema::Format_NHWC4) { im_dst_x = out_tensors_[0]->Width() * CO4; - im_dst_y = out_tensors_[0]->Height(); + im_dst_y = out_tensors_[0]->Height() * out_tensors_[0]->Batch(); } else { im_dst_y = out_tensors_[0]->Height() * CO4; - im_dst_x = out_tensors_[0]->Width(); + im_dst_x = out_tensors_[0]->Width() * out_tensors_[0]->Batch(); } size_t img_dtype = CL_FLOAT; auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); @@ -51,20 +51,17 @@ int ConcatOpenCLKernel::GetImageSize(size_t idx, std::vector *img_size) } int ConcatOpenCLKernel::Init() { if (in_tensors_[0]->shape().size() != 4) { - MS_LOG(ERROR) << "only support dim=4"; + MS_LOG(ERROR) << " only support dim = 4 "; return RET_ERROR; } auto param = reinterpret_cast(this->op_parameter_); - MS_LOG(DEBUG) << "concat at axis=: " << param->axis_; - if (param->axis_ != 0 && param->axis_ != 3) { - MS_LOG(ERROR) << "only support axis=0 or axis=3"; + MS_LOG(DEBUG) << " concat at axis=: " << param->axis_; + if (param->axis_ < 0 || param->axis_ > 3) { + MS_LOG(ERROR) << " only support axis >= 0 
and axis <= 3 "; return RET_ERROR; } - if (param->axis_ == 0) { - return RET_OK; - } if (in_tensors_.size() == 2) { std::set build_options; std::string source = concat_source; @@ -94,33 +91,6 @@ int ConcatOpenCLKernel::Init() { int ConcatOpenCLKernel::ReSize() { return RET_OK; } -int ConcatOpenCLKernel::Run_axis0() { - auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); - auto allocator_ = ocl_runtime->GetAllocator(); - cl::CommandQueue *command_queue = ocl_runtime->GetDefaultCommandQueue(); - - for (auto &tensor : in_tensors_) { - auto buffer = static_cast(allocator_->GetBuffer(tensor->Data())); - ocl_runtime->MapBuffer(*buffer, CL_MAP_READ, tensor->Size(), command_queue, true); - } - for (auto &tensor : out_tensors_) { - auto buffer = static_cast(allocator_->GetBuffer(tensor->Data())); - ocl_runtime->MapBuffer(*buffer, CL_MAP_WRITE, tensor->Size(), command_queue, true); - } - - memcpy(out_tensors_[0]->Data(), in_tensors_[0]->Data(), in_tensors_[0]->Size()); - memcpy(reinterpret_cast(out_tensors_[0]->Data()) + in_tensors_[0]->Size(), in_tensors_[1]->Data(), - in_tensors_[1]->Size()); - - for (auto tensors : {&in_tensors_, &out_tensors_}) { - for (auto &tensor : *tensors) { - auto buffer = static_cast(allocator_->GetBuffer(tensor->Data())); - ocl_runtime->UnmapBuffer(*buffer, tensor->Data()); - } - } - return RET_OK; -} - int ConcatGetBiggestDividerWithPriority(int number, int max_divider) { if (number % 8 == 0 && max_divider >= 8) { return number / 8; @@ -154,21 +124,19 @@ void ConcatGetWorkGroup(const std::vector &global, std::vector * local->push_back(z); } int ConcatOpenCLKernel::Run() { - MS_LOG(DEBUG) << this->name() << " Running!"; + MS_LOG(DEBUG) << this->name() << " Running! 
"; auto param = reinterpret_cast(this->op_parameter_); - if (param->axis_ == 0) { - return Run_axis0(); - } auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); - auto input0_shape = in_tensors_[0]->shape(); - auto input1_shape = in_tensors_[1]->shape(); + auto input1_shape = in_tensors_[0]->shape(); + auto input2_shape = in_tensors_[1]->shape(); auto output_shape = out_tensors_[0]->shape(); - cl_int2 input0_shape2_ = {UP_DIV(input0_shape[3], C4NUM), UP_DIV(input1_shape[3], C4NUM)}; // change + cl_int4 input_shape1_ = {input1_shape[0], input1_shape[1], input1_shape[2], UP_DIV(input1_shape[3], C4NUM)}; + cl_int4 input_shape2_ = {input2_shape[0], input2_shape[1], input2_shape[2], UP_DIV(input2_shape[3], C4NUM)}; cl_int4 output_shape_ = {output_shape[0], output_shape[1], output_shape[2], UP_DIV(output_shape[3], C4NUM)}; - uint32_t OH = output_shape[1]; // N*H + uint32_t OH = output_shape[0] * output_shape[1]; // N*H uint32_t OW = output_shape[2]; uint32_t OC = UP_DIV(output_shape[3], C4NUM); @@ -182,23 +150,28 @@ int ConcatOpenCLKernel::Run() { ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->Data()); ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[1]->Data()); ocl_runtime->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->Data()); - ocl_runtime->SetKernelArg(kernel_, arg_cn++, input0_shape2_); + ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape1_); + ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape2_); ocl_runtime->SetKernelArg(kernel_, arg_cn++, output_shape_); + ocl_runtime->SetKernelArg(kernel_, arg_cn++, param->axis_); } else if (in_tensors_.size() == 3) { - auto input2_shape = in_tensors_[2]->shape(); - cl_int3 input0_shape3_ = {UP_DIV(input0_shape[3], C4NUM), UP_DIV(input1_shape[3], C4NUM), - UP_DIV(input2_shape[3], C4NUM)}; + auto input3_shape = in_tensors_[2]->shape(); + cl_int4 input_shape3_ = {input3_shape[0], input3_shape[1], input3_shape[2], UP_DIV(input3_shape[3], C4NUM)}; + ocl_runtime->SetKernelArg(kernel_, 
arg_cn++, in_tensors_[0]->Data()); ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[1]->Data()); ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[2]->Data()); ocl_runtime->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->Data()); - ocl_runtime->SetKernelArg(kernel_, arg_cn++, input0_shape3_); + ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape1_); + ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape2_); + ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape3_); ocl_runtime->SetKernelArg(kernel_, arg_cn++, output_shape_); + ocl_runtime->SetKernelArg(kernel_, arg_cn++, param->axis_); } else if (in_tensors_.size() < 2) { - MS_LOG(ERROR) << " inputs must been >=2"; + MS_LOG(ERROR) << " input sizes must >= 2 "; return RET_ERROR; } else { - MS_LOG(ERROR) << "only support inputs<=3"; + MS_LOG(ERROR) << " only support inputs <= 3 "; return RET_ERROR; } ocl_runtime->RunKernel(kernel_, global, local, nullptr); @@ -213,12 +186,12 @@ kernel::LiteKernel *OpenCLConcatKernelCreator(const std::vectorInit(); if (ret != RET_OK) { - MS_LOG(ERROR) << "Init kernel failed, name: Convolution"; + MS_LOG(ERROR) << " Init kernel failed, name: Concat "; delete kernel; return nullptr; } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.h index 0f1d1ab13de9be4c701328eae9db29d9bbcf8a3d..a411bd6af61da5c071557aae8437e278e03c4e2d 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.h @@ -37,8 +37,6 @@ class ConcatOpenCLKernel : public OpenCLKernel { int ReSize() override; - int Run_axis0(); - int Run() override; int GetImageSize(size_t idx, std::vector *img_size) override; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/slice.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/slice.cc index be857a15f3c30a9bbb33d7e709e4d5abb1510ba2..6e8c76d84d573723cd974b7ab676dd57433e9c2c 100644 --- 
a/mindspore/lite/src/runtime/kernel/opencl/kernel/slice.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/slice.cc @@ -97,7 +97,7 @@ void SlcieGetWorkGroup(const std::vector &global, std::vector *l local->push_back(z); } int SliceOpenCLKernel::Run() { - MS_LOG(DEBUG) << this->name() << " Running!"; + MS_LOG(DEBUG) << this->name() << " Running! "; auto param = reinterpret_cast(this->op_parameter_); auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); auto input_shape = in_tensors_[0]->shape(); @@ -131,12 +131,12 @@ kernel::LiteKernel *OpenCLSliceKernelCreator(const std::vectorInit(); if (ret != RET_OK) { - MS_LOG(ERROR) << "Init kernel failed, name: Convolution"; + MS_LOG(ERROR) << " Init kernel failed, name: Slice "; delete kernel; return nullptr; } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc index 93b67fba4ef6976a86ed1f9550b8c73a75f7297b..f6721304ffc5763cddf8ce283f23bba75b3ebf23 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc @@ -94,7 +94,7 @@ int ToFormatOpenCLKernel::InitNHWCShape() { int ToFormatOpenCLKernel::ReSize() { return RET_OK; } int ToFormatOpenCLKernel::GetGlobalSize(size_t idx, std::vector *global_size) { - std::vector vec = {nhwc_shape_[1], nhwc_shape_[2], UP_DIV(nhwc_shape_[3], C4NUM)}; + std::vector vec = {nhwc_shape_[0] * nhwc_shape_[1], nhwc_shape_[2], UP_DIV(nhwc_shape_[3], C4NUM)}; *global_size = std::move(vec); return RET_OK; } @@ -107,13 +107,13 @@ int ToFormatOpenCLKernel::GetImageSize(size_t idx, std::vector *img_size size_t im_dst_x, im_dst_y; std::vector shapex = out_tensors_[0]->shape(); if (out_tensors_[0]->GetFormat() == schema::Format_NC4HW4) { - int c = shapex[1]; - int h = shapex[2]; + int c = shapex[1] * shapex[2]; + int h = shapex[0]; int w = shapex[3]; im_dst_y = h * UP_DIV(c, C4NUM); im_dst_x = w; } else if 
(out_tensors_[0]->GetFormat() == schema::Format_NHWC4) { - int h = shapex[1]; + int h = shapex[0] * shapex[1]; int w = shapex[2]; int c = shapex[3]; im_dst_x = w * UP_DIV(c, C4NUM); diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/batchnorm_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/batchnorm_tests.cc index ab95c9e57efe0c9e7ca88c17b2bbdd124aa67c14..7ab29a333a8eb35b159d4bb7d0327b09b114f023 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/batchnorm_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/batchnorm_tests.cc @@ -32,13 +32,6 @@ class TestBatchnormOpenCLfp16 : public mindspore::CommonTest { TestBatchnormOpenCLfp16() {} }; -template -void CompareOutputData1(T *output_data, T *correct_data, int size, float err_bound) { - for (size_t i = 0; i < size; i++) { - T abs = fabs(output_data[i] - correct_data[i]); - ASSERT_LE(abs, err_bound); - } -} TEST_F(TestBatchnormOpenCLfp16, Batchnormfp16input_dim4) { MS_LOG(INFO) << "begin test"; auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); @@ -46,7 +39,7 @@ TEST_F(TestBatchnormOpenCLfp16, Batchnormfp16input_dim4) { ocl_runtime->Init(); auto allocator = ocl_runtime->GetAllocator(); - MS_LOG(INFO) << "Read tensors from .bin"; + MS_LOG(INFO) << " Read tensors from .bin "; std::vector input_shape = {1, 256, 256, 48}; std::vector output_shape = {1, 256, 256, 48}; auto data_type = kNumberTypeFloat32; @@ -59,7 +52,7 @@ TEST_F(TestBatchnormOpenCLfp16, Batchnormfp16input_dim4) { std::string var_path = "./test_data/batchnorm_varfp16.bin"; std::string offset_path = "./test_data/batchnorm_offsetfp16.bin"; std::string scale_path = "./test_data/batchnorm_scalefp16.bin"; - std::string output_path = "./test_data/batchnorm_out_datafp16.bin"; + std::string output_path = "./test_data/batchnorm_correctdatafp16.bin"; auto input_data = reinterpret_cast(mindspore::lite::ReadFile(input_path.c_str(), &input_size)); auto correct_data = 
reinterpret_cast(mindspore::lite::ReadFile(output_path.c_str(), &output_size)); size_t mean_size, var_size, scale_size, offset_size; @@ -68,7 +61,7 @@ TEST_F(TestBatchnormOpenCLfp16, Batchnormfp16input_dim4) { auto scale_data = reinterpret_cast(mindspore::lite::ReadFile(scale_path.c_str(), &scale_size)); auto offset_data = reinterpret_cast(mindspore::lite::ReadFile(offset_path.c_str(), &offset_size)); - MS_LOG(INFO) << "construct tensors"; + MS_LOG(INFO) << " construct tensors "; lite::tensor::Tensor *tensor_data = new (std::nothrow) lite::tensor::Tensor(data_type, input_shape, schema::Format_NHWC, tensor_type); lite::tensor::Tensor *tensor_mean = @@ -81,13 +74,13 @@ TEST_F(TestBatchnormOpenCLfp16, Batchnormfp16input_dim4) { new (std::nothrow) lite::tensor::Tensor(data_type, {1, 1, 1, input_shape[3]}, schema::Format_NHWC, tensor_type); if (tensor_data == nullptr || tensor_mean == nullptr || tensor_var == nullptr || tensor_scale == nullptr || tensor_offset == nullptr) { - MS_LOG(INFO) << "init tensor failed"; + MS_LOG(INFO) << " init tensor failed "; return; } auto *output_tensor = new (std::nothrow) lite::tensor::Tensor(data_type, output_shape, schema::Format_NHWC4, tensor_type); if (output_tensor == nullptr) { - MS_LOG(INFO) << "init tensor failed"; + MS_LOG(INFO) << " init tensor failed "; delete tensor_data; delete tensor_mean; delete tensor_var; @@ -98,10 +91,10 @@ TEST_F(TestBatchnormOpenCLfp16, Batchnormfp16input_dim4) { std::vector inputs = {tensor_data, tensor_scale, tensor_offset, tensor_mean, tensor_var}; std::vector outputs{output_tensor}; - MS_LOG(INFO) << "initialize tensors"; + MS_LOG(INFO) << " initialize tensors "; auto param = new (std::nothrow) BatchNormParameter(); if (param == nullptr) { - MS_LOG(INFO) << "new BatchNormParameter failed"; + MS_LOG(INFO) << " new BatchNormParameter failed "; for (auto tensor : outputs) { delete tensor; } @@ -111,7 +104,7 @@ TEST_F(TestBatchnormOpenCLfp16, Batchnormfp16input_dim4) { auto *batchnorm_kernel = new 
(std::nothrow) kernel::BatchNormOpenCLKernel(reinterpret_cast(param), inputs, outputs); if (batchnorm_kernel == nullptr) { - MS_LOG(INFO) << "new kernel::BatchNorm_kernel failed"; + MS_LOG(INFO) << " new kernel::BatchNorm_kernel failed "; for (auto tensor : outputs) { delete tensor; } @@ -125,11 +118,11 @@ TEST_F(TestBatchnormOpenCLfp16, Batchnormfp16input_dim4) { input_tensor->MallocData(allocator); } - MS_LOG(INFO) << "initialize sub_graph"; + MS_LOG(INFO) << " initialize sub_graph "; std::vector kernels{batchnorm_kernel}; auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels); if (sub_graph == nullptr) { - MS_LOG(INFO) << "new kernel::SubGraphOpenCLKernel failed"; + MS_LOG(INFO) << " new kernel::SubGraphOpenCLKernel failed "; for (auto tensor : outputs) { delete tensor; } @@ -138,7 +131,7 @@ TEST_F(TestBatchnormOpenCLfp16, Batchnormfp16input_dim4) { return; } sub_graph->Init(); - MS_LOG(INFO) << "init tensors"; + MS_LOG(INFO) << " init tensors "; memcpy(inputs[0]->Data(), input_data, input_size); memcpy(inputs[1]->Data(), scale_data, scale_size); memcpy(inputs[2]->Data(), offset_data, offset_size); @@ -148,7 +141,7 @@ TEST_F(TestBatchnormOpenCLfp16, Batchnormfp16input_dim4) { sub_graph->Run(); auto *output_data_gpu = reinterpret_cast(output_tensor->Data()); - CompareOutputData1(output_data_gpu, correct_data, output_tensor->ElementsNum(), 0.0001); + CompareOutputData(output_data_gpu, correct_data, output_tensor->ElementsNum(), 0.01); for (auto tensor : inputs) { delete tensor; } @@ -158,15 +151,14 @@ TEST_F(TestBatchnormOpenCLfp16, Batchnormfp16input_dim4) { delete param; delete batchnorm_kernel; delete sub_graph; - lite::opencl::OpenCLRuntime::DeleteInstance(); } TEST_F(TestBatchnormOpenCLfp32, Batchnormfp32input_dim4) { - MS_LOG(INFO) << "begin test"; + MS_LOG(INFO) << " begin test "; auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); ocl_runtime->Init(); auto allocator = 
ocl_runtime->GetAllocator(); - MS_LOG(INFO) << "Read tensors from .bin"; + MS_LOG(INFO) << " Read tensors from .bin "; std::vector input_shape = {1, 256, 256, 47}; std::vector output_shape = {1, 256, 256, 47}; auto data_type = kNumberTypeFloat32; @@ -188,7 +180,7 @@ TEST_F(TestBatchnormOpenCLfp32, Batchnormfp32input_dim4) { auto scale_data = reinterpret_cast(mindspore::lite::ReadFile(scale_path.c_str(), &scale_size)); auto offset_data = reinterpret_cast(mindspore::lite::ReadFile(offset_path.c_str(), &offset_size)); - MS_LOG(INFO) << "construct tensors"; + MS_LOG(INFO) << " construct tensors "; lite::tensor::Tensor *tensor_data = new (std::nothrow) lite::tensor::Tensor(data_type, input_shape, schema::Format_NHWC, tensor_type); lite::tensor::Tensor *tensor_mean = @@ -201,13 +193,13 @@ TEST_F(TestBatchnormOpenCLfp32, Batchnormfp32input_dim4) { new (std::nothrow) lite::tensor::Tensor(data_type, {1, 1, 1, input_shape[3]}, schema::Format_NHWC, tensor_type); if (tensor_data == nullptr || tensor_mean == nullptr || tensor_var == nullptr || tensor_scale == nullptr || tensor_offset == nullptr) { - MS_LOG(INFO) << "init tensor failed"; + MS_LOG(INFO) << " init tensor failed "; return; } auto *output_tensor = new (std::nothrow) lite::tensor::Tensor(data_type, output_shape, schema::Format_NHWC4, tensor_type); if (output_tensor == nullptr) { - MS_LOG(INFO) << "init tensor failed"; + MS_LOG(INFO) << " init tensor failed "; delete tensor_data; delete tensor_mean; delete tensor_var; @@ -218,10 +210,10 @@ TEST_F(TestBatchnormOpenCLfp32, Batchnormfp32input_dim4) { std::vector inputs = {tensor_data, tensor_scale, tensor_offset, tensor_mean, tensor_var}; std::vector outputs{output_tensor}; - MS_LOG(INFO) << "initialize tensors"; + MS_LOG(INFO) << " initialize tensors "; auto param = new (std::nothrow) BatchNormParameter(); if (param == nullptr) { - MS_LOG(INFO) << "new BatchNormParameter failed"; + MS_LOG(INFO) << " new BatchNormParameter failed "; for (auto tensor : outputs) { delete 
tensor; } @@ -231,7 +223,7 @@ TEST_F(TestBatchnormOpenCLfp32, Batchnormfp32input_dim4) { auto *batchnorm_kernel = new (std::nothrow) kernel::BatchNormOpenCLKernel(reinterpret_cast(param), inputs, outputs); if (batchnorm_kernel == nullptr) { - MS_LOG(INFO) << "new kernel::BatchNorm_kernel failed"; + MS_LOG(INFO) << " new kernel::BatchNorm_kernel failed "; for (auto tensor : outputs) { delete tensor; } @@ -245,11 +237,11 @@ TEST_F(TestBatchnormOpenCLfp32, Batchnormfp32input_dim4) { input_tensor->MallocData(allocator); } - MS_LOG(INFO) << "initialize sub_graph"; + MS_LOG(INFO) << " initialize sub_graph "; std::vector kernels{batchnorm_kernel}; auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels); if (sub_graph == nullptr) { - MS_LOG(INFO) << "new kernel::SubGraphOpenCLKernel failed"; + MS_LOG(INFO) << " new kernel::SubGraphOpenCLKernel failed "; for (auto tensor : outputs) { delete tensor; } @@ -258,7 +250,7 @@ TEST_F(TestBatchnormOpenCLfp32, Batchnormfp32input_dim4) { return; } sub_graph->Init(); - MS_LOG(INFO) << "init tensors"; + MS_LOG(INFO) << " init tensors "; memcpy(inputs[0]->Data(), input_data, input_size); memcpy(inputs[1]->Data(), scale_data, scale_size); memcpy(inputs[2]->Data(), offset_data, offset_size); @@ -268,7 +260,7 @@ TEST_F(TestBatchnormOpenCLfp32, Batchnormfp32input_dim4) { sub_graph->Run(); auto *output_data_gpu = reinterpret_cast(output_tensor->Data()); - CompareOutputData1(output_data_gpu, correct_data, output_tensor->ElementsNum(), 0.0001); + CompareOutputData(output_data_gpu, correct_data, output_tensor->ElementsNum(), 0.0001); for (auto tensor : inputs) { delete tensor; } @@ -278,6 +270,5 @@ TEST_F(TestBatchnormOpenCLfp32, Batchnormfp32input_dim4) { delete param; delete batchnorm_kernel; delete sub_graph; - lite::opencl::OpenCLRuntime::DeleteInstance(); } } // namespace mindspore diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/concat_tests.cc 
b/mindspore/lite/test/ut/src/runtime/kernel/opencl/concat_tests.cc index 08f6223de278ce48e6232f6a20cd28f5f8a26ad1..c72dfa672025e50a9a78846738805a4c7873e417 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/concat_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/concat_tests.cc @@ -18,70 +18,10 @@ #include "utils/log_adapter.h" #include "common/common_test.h" #include "mindspore/lite/src/runtime/opencl/opencl_runtime.h" +#include "mindspore/lite/src/common/file_utils.h" #include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h" #include "mindspore/lite/src/runtime/kernel/opencl/kernel/concat.h" -template -void ConcatComputeByCPU_2input_dim4_axis3(const T *input0, const T *input1, T *output, std::vector input_shape0, - std::vector input_shape1, std::vector output_shape, - const int axis) { - int postion, index0 = 0, index1 = 0; - for (int i = 0; i < output_shape[0]; i++) { - for (int j = 0; j < output_shape[1]; j++) { - for (int k = 0; k < output_shape[2]; k++) { - postion = i * output_shape[1] * output_shape[2] * output_shape[3] + j * output_shape[2] * output_shape[3] + - k * output_shape[3]; - for (int w = 0; w < output_shape[3]; w++) { - if (w < input_shape0[3] + input_shape1[3]) { - output[postion++] = (w < input_shape0[3]) ? 
input0[index0++] : input1[index1++]; - } else { - for (int ind = input_shape0[3] + input_shape1[3]; ind < output_shape[3]; ind++) { - output[postion++] = 0; - } - } - } - } - } - } -} -template -void ConcatComputeByCPU_3input_dim4_axis3(T *input0, T *input1, T *input2, T *output, std::vector input_shape0, - std::vector input_shape1, std::vector input_shape2, - std::vector output_shape, const int axis) { - int postion, index0 = 0, index1 = 0, index2 = 0; - for (int i = 0; i < output_shape[0]; i++) { - for (int j = 0; j < output_shape[1]; j++) { - for (int k = 0; k < output_shape[2]; k++) { - postion = i * output_shape[1] * output_shape[2] * output_shape[3] + j * output_shape[2] * output_shape[3] + - k * output_shape[3]; - for (int w = 0; w < output_shape[3]; w++) { - if (w < input_shape0[3]) { - int align = UP_DIV(input_shape0[3], 4) * 4; - index0 = i * input_shape0[1] * input_shape0[2] * align + j * input_shape0[2] * align + k * align + w; - output[postion++] = input0[index0]; - } else if (w >= input_shape0[3] && w < (input_shape0[3] + input_shape1[3])) { - int align = UP_DIV(input_shape1[3], 4) * 4; - index1 = i * input_shape1[1] * input_shape1[2] * align + j * input_shape1[2] * align + k * align + w - - input_shape0[3]; - output[postion++] = input1[index1]; - } else if ((input_shape0[3] + input_shape1[3]) <= w && - w < (input_shape0[3] + input_shape1[3] + input_shape2[3])) { - int align = UP_DIV(input_shape2[3], 4) * 4; - index2 = i * input_shape2[1] * input_shape2[2] * align + j * input_shape2[2] * align + k * align + w - - input_shape0[3] - input_shape1[3]; - output[postion++] = input2[index2]; - } else { - for (int ind = input_shape0[3] + input_shape1[3] + input_shape2[3]; ind < output_shape[3]; ind++) { - output[postion++] = 0; - } - break; - } - } - } - } - } -} - namespace mindspore { class TestConcatOpenCLfp32 : public mindspore::CommonTest { public: @@ -100,17 +40,29 @@ void CompareOutputData1(T *output_data, T *correct_data, int size, float err_bou } } 
TEST_F(TestConcatOpenCLfp16, ConcatFp16_2input_dim4_axis3) { - MS_LOG(INFO) << "begin test"; + MS_LOG(INFO) << " begin test "; auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); ocl_runtime->SetFp16Enable(true); ocl_runtime->Init(); auto allocator = ocl_runtime->GetAllocator(); - MS_LOG(INFO) << "init tensors"; - constexpr int INPUT_NUM = 3; - std::array, INPUT_NUM> input_shapes = { - std::vector{1, 16, 256, 80}, std::vector{1, 16, 256, 80}, std::vector{1, 16, 256, 80}}; - std::vector output_shape = {1, 16, 256, 240}; + // get the input from .bin + size_t input1_size, input2_size, input3_size, output_size; + std::string input1Ppath = "./test_data/concatfp16_input1.bin"; + std::string input2Ppath = "./test_data/concatfp16_input2.bin"; + std::string input3Ppath = "./test_data/concatfp16_input3.bin"; + std::string correctOutputPath = "./test_data/concatfp16_output.bin"; + auto input_data1 = reinterpret_cast(mindspore::lite::ReadFile(input1Ppath.c_str(), &input1_size)); + auto input_data2 = reinterpret_cast(mindspore::lite::ReadFile(input2Ppath.c_str(), &input2_size)); + auto input_data3 = reinterpret_cast(mindspore::lite::ReadFile(input3Ppath.c_str(), &input3_size)); + auto correctOutput = + reinterpret_cast(mindspore::lite::ReadFile(correctOutputPath.c_str(), &output_size)); + + MS_LOG(INFO) << " init tensors "; + constexpr int INPUT_NUM = 2; + std::array, INPUT_NUM> input_shapes = {std::vector{1, 19, 19, 96}, + std::vector{1, 19, 19, 96}}; + std::vector output_shape = {2, 19, 19, 96}; auto data_type = kNumberTypeFloat16; auto tensor_type = schema::NodeType_ValueNode; std::vector inputs; @@ -118,26 +70,26 @@ TEST_F(TestConcatOpenCLfp16, ConcatFp16_2input_dim4_axis3) { auto input_temp = new (std::nothrow) lite::tensor::Tensor(data_type, shape, schema::Format_NHWC4, tensor_type); inputs.push_back(input_temp); if (input_temp == nullptr) { - MS_LOG(INFO) << "new input_tensor failed"; + MS_LOG(INFO) << " new input_tensor failed "; return; } } auto 
*output_tensor = new (std::nothrow) lite::tensor::Tensor(data_type, output_shape, schema::Format_NHWC4, tensor_type); if (output_tensor == nullptr) { - MS_LOG(INFO) << "new output_tensor failed"; + MS_LOG(INFO) << " new output_tensor failed "; for (auto tensor : inputs) { delete tensor; } return; } std::vector outputs{output_tensor}; - MS_LOG(INFO) << "input_shapes size=: " << input_shapes.size(); + MS_LOG(INFO) << " input_shapes size =: " << input_shapes.size(); - MS_LOG(INFO) << "initialize tensors"; + MS_LOG(INFO) << " initialize tensors "; auto param = new (std::nothrow) ConcatParameter(); if (param == nullptr) { - MS_LOG(INFO) << "new ConcatParameter failed"; + MS_LOG(INFO) << " new ConcatParameter failed "; for (auto tensor : inputs) { delete tensor; } @@ -146,11 +98,11 @@ TEST_F(TestConcatOpenCLfp16, ConcatFp16_2input_dim4_axis3) { } return; } - param->axis_ = 3; + param->axis_ = 0; auto *concat_kernel = new (std::nothrow) kernel::ConcatOpenCLKernel(reinterpret_cast(param), inputs, outputs); if (concat_kernel == nullptr) { - MS_LOG(INFO) << "new kernel::ConcatOpenCLKernel failed"; + MS_LOG(INFO) << " new kernel::ConcatOpenCLKernel failed "; for (auto tensor : inputs) { delete tensor; } @@ -165,12 +117,11 @@ TEST_F(TestConcatOpenCLfp16, ConcatFp16_2input_dim4_axis3) { for (auto &input_tensor : inputs) { input_tensor->MallocData(allocator); } - - MS_LOG(INFO) << "initialize sub_graph"; + MS_LOG(INFO) << " initialize sub_graph "; std::vector kernels{concat_kernel}; auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels); if (sub_graph == nullptr) { - MS_LOG(INFO) << "new kernel::SubGraphOpenCLKernel failed"; + MS_LOG(INFO) << " new kernel::SubGraphOpenCLKernel failed "; for (auto tensor : inputs) { delete tensor; } @@ -182,33 +133,22 @@ TEST_F(TestConcatOpenCLfp16, ConcatFp16_2input_dim4_axis3) { return; } sub_graph->Init(); - unsigned int seed = 123; - MS_LOG(INFO) << "initialize input data"; - for (auto 
&input_tensor : inputs) { - auto input_data = reinterpret_cast(input_tensor->Data()); - for (int i = 0; i < input_tensor->ElementsNum(); ++i) { - input_data[i] = static_cast(rand_r(&seed) % 10 + 1); - } - } - - // compute the result for CPU - auto *input_data0 = reinterpret_cast(inputs[0]->Data()); - auto *input_data1 = reinterpret_cast(inputs[1]->Data()); - std::vector output_data_cpu(output_shape[0] * output_shape[1] * output_shape[2] * output_shape[3]); + MS_LOG(INFO) << " initialize input data "; if (inputs.size() == 2) { - ConcatComputeByCPU_2input_dim4_axis3(input_data0, input_data1, output_data_cpu.data(), input_shapes[0], - input_shapes[1], output_shape, param->axis_); - } - if (inputs.size() == 3) { - auto *input_data2 = reinterpret_cast(inputs[2]->Data()); - ConcatComputeByCPU_3input_dim4_axis3(input_data0, input_data1, input_data2, output_data_cpu.data(), input_shapes[0], - input_shapes[1], input_shapes[2], output_shape, param->axis_); + memcpy(inputs[0]->Data(), input_data1, input1_size); + memcpy(inputs[1]->Data(), input_data2, input2_size); + } else if (inputs.size() == 3) { + memcpy(inputs[0]->Data(), input_data1, input1_size); + memcpy(inputs[1]->Data(), input_data2, input2_size); + memcpy(inputs[2]->Data(), input_data3, input3_size); + } else { + MS_LOG(ERROR) << " input size must be 2 or 3"; } std::cout << "==================output data================" << std::endl; sub_graph->Run(); auto *output_data_gpu = reinterpret_cast(output_tensor->Data()); - CompareOutputData1(output_data_gpu, output_data_cpu.data(), output_tensor->ElementsNum(), 0.00001); + CompareOutputData1(output_data_gpu, correctOutput, output_tensor->ElementsNum(), 0.000001); for (auto tensor : inputs) { delete tensor; } @@ -218,47 +158,57 @@ TEST_F(TestConcatOpenCLfp16, ConcatFp16_2input_dim4_axis3) { delete param; delete concat_kernel; delete sub_graph; - lite::opencl::OpenCLRuntime::DeleteInstance(); } TEST_F(TestConcatOpenCLfp32, ConcatFp32_2input_dim4_axis3) { - MS_LOG(INFO) << 
"begin test"; + MS_LOG(INFO) << " begin test "; auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); ocl_runtime->Init(); auto allocator = ocl_runtime->GetAllocator(); - MS_LOG(INFO) << "init tensors"; + // get the input from .bin + size_t input1_size, input2_size, input3_size, output_size; + std::string input1Ppath = "./test_data/concat_input1.bin"; + std::string input2Ppath = "./test_data/concat_input2.bin"; + std::string input3Ppath = "./test_data/concat_input3.bin"; + std::string correctOutputPath = "./test_data/concat_output.bin"; + auto input_data1 = reinterpret_cast(mindspore::lite::ReadFile(input1Ppath.c_str(), &input1_size)); + auto input_data2 = reinterpret_cast(mindspore::lite::ReadFile(input2Ppath.c_str(), &input2_size)); + auto input_data3 = reinterpret_cast(mindspore::lite::ReadFile(input3Ppath.c_str(), &input3_size)); + auto correctOutput = reinterpret_cast(mindspore::lite::ReadFile(correctOutputPath.c_str(), &output_size)); + + MS_LOG(INFO) << " init tensors "; constexpr int INPUT_NUM = 3; std::array, INPUT_NUM> input_shapes = { std::vector{1, 16, 256, 80}, std::vector{1, 16, 256, 80}, std::vector{1, 16, 256, 80}}; - std::vector output_shape = {1, 16, 256, 240}; + std::vector output_shape = {1, 48, 256, 80}; auto data_type = kNumberTypeFloat32; auto tensor_type = schema::NodeType_ValueNode; std::vector inputs; for (auto &shape : input_shapes) { - auto input_temp = new (std::nothrow) lite::tensor::Tensor(data_type, shape, schema::Format_NHWC4, tensor_type); + auto input_temp = new (std::nothrow) lite::tensor::Tensor(data_type, shape, schema::Format_NHWC, tensor_type); inputs.push_back(input_temp); if (input_temp == nullptr) { - MS_LOG(INFO) << "new input_tensor failed"; + MS_LOG(INFO) << " new input_tensor failed "; return; } } auto *output_tensor = - new (std::nothrow) lite::tensor::Tensor(data_type, output_shape, schema::Format_NHWC4, tensor_type); + new (std::nothrow) lite::tensor::Tensor(data_type, output_shape, schema::Format_NHWC, 
tensor_type); if (output_tensor == nullptr) { - MS_LOG(INFO) << "new output_tensor failed"; + MS_LOG(INFO) << " new output_tensor failed "; for (auto tensor : inputs) { delete tensor; } return; } std::vector outputs{output_tensor}; - MS_LOG(INFO) << "input_shapes size=: " << input_shapes.size(); + MS_LOG(INFO) << " input_shapes size=: " << input_shapes.size(); - MS_LOG(INFO) << "initialize tensors"; + MS_LOG(INFO) << " initialize tensors "; auto param = new (std::nothrow) ConcatParameter(); if (param == nullptr) { - MS_LOG(INFO) << "new ConcatParameter failed"; + MS_LOG(INFO) << " new ConcatParameter failed "; for (auto tensor : inputs) { delete tensor; } @@ -267,11 +217,11 @@ TEST_F(TestConcatOpenCLfp32, ConcatFp32_2input_dim4_axis3) { } return; } - param->axis_ = 3; + param->axis_ = 1; auto *concat_kernel = new (std::nothrow) kernel::ConcatOpenCLKernel(reinterpret_cast(param), inputs, outputs); if (concat_kernel == nullptr) { - MS_LOG(INFO) << "new kernel::ConcatOpenCLKernel failed"; + MS_LOG(INFO) << " new kernel::ConcatOpenCLKernel failed "; for (auto tensor : inputs) { delete tensor; } @@ -287,11 +237,11 @@ TEST_F(TestConcatOpenCLfp32, ConcatFp32_2input_dim4_axis3) { input_tensor->MallocData(allocator); } - MS_LOG(INFO) << "initialize sub_graph"; + MS_LOG(INFO) << " initialize sub_graph "; std::vector kernels{concat_kernel}; auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels); if (sub_graph == nullptr) { - MS_LOG(INFO) << "new kernel::SubGraphOpenCLKernel failed"; + MS_LOG(INFO) << " new kernel::SubGraphOpenCLKernel failed "; for (auto tensor : inputs) { delete tensor; } @@ -303,33 +253,22 @@ TEST_F(TestConcatOpenCLfp32, ConcatFp32_2input_dim4_axis3) { return; } sub_graph->Init(); - unsigned int seed = 123; - MS_LOG(INFO) << "initialize input data"; - for (auto &input_tensor : inputs) { - auto input_data = reinterpret_cast(input_tensor->Data()); - for (int i = 0; i < input_tensor->ElementsNum(); ++i) { 
- input_data[i] = static_cast(rand_r(&seed) % 10 + 1); - } - } - - // compute the result for CPU - auto *input_data0 = reinterpret_cast(inputs[0]->Data()); - auto *input_data1 = reinterpret_cast(inputs[1]->Data()); - std::vector output_data_cpu(output_shape[0] * output_shape[1] * output_shape[2] * output_shape[3]); + MS_LOG(INFO) << " initialize input data "; if (inputs.size() == 2) { - ConcatComputeByCPU_2input_dim4_axis3(input_data0, input_data1, output_data_cpu.data(), input_shapes[0], - input_shapes[1], output_shape, param->axis_); - } - if (inputs.size() == 3) { - auto *input_data2 = reinterpret_cast(inputs[2]->Data()); - ConcatComputeByCPU_3input_dim4_axis3(input_data0, input_data1, input_data2, output_data_cpu.data(), input_shapes[0], - input_shapes[1], input_shapes[2], output_shape, param->axis_); + memcpy(inputs[0]->Data(), input_data1, input1_size); + memcpy(inputs[1]->Data(), input_data2, input2_size); + } else if (inputs.size() == 3) { + memcpy(inputs[0]->Data(), input_data1, input1_size); + memcpy(inputs[1]->Data(), input_data2, input2_size); + memcpy(inputs[2]->Data(), input_data3, input3_size); + } else { + MS_LOG(ERROR) << " input size must be 2 or 3 "; } std::cout << "==================output data================" << std::endl; sub_graph->Run(); auto *output_data_gpu = reinterpret_cast(output_tensor->Data()); - CompareOutputData1(output_data_gpu, output_data_cpu.data(), output_tensor->ElementsNum(), 0.00001); + CompareOutputData1(output_data_gpu, correctOutput, output_tensor->ElementsNum(), 0.00001); for (auto tensor : inputs) { delete tensor; } @@ -339,6 +278,5 @@ TEST_F(TestConcatOpenCLfp32, ConcatFp32_2input_dim4_axis3) { delete param; delete concat_kernel; delete sub_graph; - lite::opencl::OpenCLRuntime::DeleteInstance(); } } // namespace mindspore diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/slice_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/slice_tests.cc index 
33ff054d32b8980479d561fcfb4aaafe1e817faf..3faefcd93509871f369d48460b5ab79c5dfbbf40 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/slice_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/slice_tests.cc @@ -41,44 +41,43 @@ void CompareOutputData1(T *output_data, T *correct_data, int size, float err_bou } TEST_F(TestSliceOpenCLfp32, Slicefp32input_dim4) { - MS_LOG(INFO) << "begin test"; + MS_LOG(INFO) << " begin test "; auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); ocl_runtime->Init(); auto allocator = ocl_runtime->GetAllocator(); - MS_LOG(INFO) << "Read tensors from .bin"; - std::vector input_shape = {1, 256, 256, 48}; - std::vector output_shape = {1, 255, 255, 15}; - std::vector begin = {0, 1, 1, 7}; - std::vector size = {1, 255, 255, 15}; + MS_LOG(INFO) << " Read tensors from .bin "; + std::vector input_shape = {1, 19, 19, 96}; + std::vector output_shape = {1, 10, 10, 13}; + std::vector begin = {0, 2, 3, 4}; + std::vector size = {1, 10, 10, 13}; auto data_type = kNumberTypeFloat32; auto tensor_type = schema::NodeType_ValueNode; // get the input from .bin size_t input_size, output_size; - std::string input_path = "./test_data/in_datafp32.bin"; - std::string output_path = "./test_data/out_datafp32.bin"; + std::string input_path = "./test_data/in_slicefp32.bin"; + std::string output_path = "./test_data/out_slicefp32.bin"; auto input_data = reinterpret_cast(mindspore::lite::ReadFile(input_path.c_str(), &input_size)); auto correct_data = reinterpret_cast(mindspore::lite::ReadFile(output_path.c_str(), &output_size)); - - MS_LOG(INFO) << "construct tensors"; + MS_LOG(INFO) << " construct tensors "; lite::tensor::Tensor *tensor_data = new (std::nothrow) lite::tensor::Tensor(data_type, input_shape, schema::Format_NHWC, tensor_type); if (tensor_data == nullptr) { - MS_LOG(INFO) << "init tensor failed"; + MS_LOG(INFO) << " init tensor failed "; return; } auto *output_tensor = new (std::nothrow) lite::tensor::Tensor(data_type, 
output_shape, schema::Format_NHWC4, tensor_type); if (output_tensor == nullptr) { delete tensor_data; - MS_LOG(INFO) << "init tensor failed"; + MS_LOG(INFO) << " init tensor failed "; return; } std::vector inputs = {tensor_data}; std::vector outputs = {output_tensor}; - MS_LOG(INFO) << "setting SliceParameter"; + MS_LOG(INFO) << "setting SliceParameter "; auto param = new (std::nothrow) SliceParameter(); if (param == nullptr) { for (auto tensor : inputs) { @@ -87,7 +86,7 @@ TEST_F(TestSliceOpenCLfp32, Slicefp32input_dim4) { for (auto tensor : outputs) { delete tensor; } - MS_LOG(INFO) << "new SliceParameter failed"; + MS_LOG(INFO) << "new SliceParameter failed "; return; } for (int i = 0; i < input_shape.size(); i++) { @@ -105,7 +104,7 @@ TEST_F(TestSliceOpenCLfp32, Slicefp32input_dim4) { delete tensor; } delete param; - MS_LOG(INFO) << "new kernel::slice_kernel failed"; + MS_LOG(INFO) << "new kernel::slice_kernel failed "; return; } slice_kernel->Init(); @@ -115,7 +114,7 @@ TEST_F(TestSliceOpenCLfp32, Slicefp32input_dim4) { input_tensor->MallocData(allocator); } - MS_LOG(INFO) << "initialize sub_graph"; + MS_LOG(INFO) << " initialize sub_graph "; std::vector kernels{slice_kernel}; auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels); if (sub_graph == nullptr) { @@ -127,12 +126,12 @@ TEST_F(TestSliceOpenCLfp32, Slicefp32input_dim4) { } delete param; delete slice_kernel; - MS_LOG(INFO) << "new kernel::SubGraphOpenCLKernel failed"; + MS_LOG(INFO) << " new kernel::SubGraphOpenCLKernel failed "; return; } sub_graph->Init(); - MS_LOG(INFO) << "init tensors"; + MS_LOG(INFO) << " init tensors "; memcpy(inputs[0]->Data(), input_data, input_size); std::cout << "==================output data================" << std::endl; @@ -148,16 +147,15 @@ TEST_F(TestSliceOpenCLfp32, Slicefp32input_dim4) { } delete slice_kernel; delete sub_graph; - lite::opencl::OpenCLRuntime::DeleteInstance(); } TEST_F(TestSliceOpenCLfp16, 
Slicefp16input_dim4) { - MS_LOG(INFO) << "begin test"; + MS_LOG(INFO) << " begin test "; auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); ocl_runtime->SetFp16Enable(true); ocl_runtime->Init(); auto allocator = ocl_runtime->GetAllocator(); - MS_LOG(INFO) << "Read tensors from .bin"; + MS_LOG(INFO) << " Read tensors from .bin "; std::vector input_shape = {1, 256, 256, 48}; std::vector output_shape = {1, 255, 255, 15}; std::vector begin = {0, 1, 1, 7}; @@ -172,24 +170,24 @@ TEST_F(TestSliceOpenCLfp16, Slicefp16input_dim4) { auto input_data = reinterpret_cast(mindspore::lite::ReadFile(input_path.c_str(), &input_size)); auto correct_data = reinterpret_cast(mindspore::lite::ReadFile(output_path.c_str(), &output_size)); - MS_LOG(INFO) << "construct tensors"; + MS_LOG(INFO) << " construct tensors "; lite::tensor::Tensor *tensor_data = new (std::nothrow) lite::tensor::Tensor(data_type, input_shape, schema::Format_NHWC, tensor_type); if (tensor_data == nullptr) { - MS_LOG(INFO) << "init tensor failed"; + MS_LOG(INFO) << " init tensor failed "; return; } auto *output_tensor = new (std::nothrow) lite::tensor::Tensor(data_type, output_shape, schema::Format_NHWC4, tensor_type); if (output_tensor == nullptr) { delete tensor_data; - MS_LOG(INFO) << "init tensor failed"; + MS_LOG(INFO) << " init tensor failed "; return; } std::vector inputs = {tensor_data}; std::vector outputs = {output_tensor}; - MS_LOG(INFO) << "setting SliceParameter"; + MS_LOG(INFO) << " setting SliceParameter "; auto param = new (std::nothrow) SliceParameter(); if (param == nullptr) { for (auto tensor : inputs) { @@ -198,10 +196,10 @@ TEST_F(TestSliceOpenCLfp16, Slicefp16input_dim4) { for (auto tensor : outputs) { delete tensor; } - MS_LOG(INFO) << "new SliceParameter failed"; + MS_LOG(INFO) << " new SliceParameter failed "; return; } - for (int i = 0; i < 4; i++) { + for (int i = 0; i < input_shape.size(); i++) { param->begin_[i] = begin[i]; param->size_[i] = size[i]; } @@ -216,7 +214,7 @@ 
TEST_F(TestSliceOpenCLfp16, Slicefp16input_dim4) { delete tensor; } delete param; - MS_LOG(INFO) << "new kernel::slice_kernel failed"; + MS_LOG(INFO) << " new kernel::slice_kernel failed "; return; } slice_kernel->Init(); @@ -226,7 +224,7 @@ TEST_F(TestSliceOpenCLfp16, Slicefp16input_dim4) { input_tensor->MallocData(allocator); } - MS_LOG(INFO) << "initialize sub_graph"; + MS_LOG(INFO) << " initialize sub_graph "; std::vector kernels{slice_kernel}; auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels); if (sub_graph == nullptr) { @@ -238,12 +236,12 @@ TEST_F(TestSliceOpenCLfp16, Slicefp16input_dim4) { } delete param; delete slice_kernel; - MS_LOG(INFO) << "new kernel::SubGraphOpenCLKernel failed"; + MS_LOG(INFO) << " new kernel::SubGraphOpenCLKernel failed "; return; } sub_graph->Init(); - MS_LOG(INFO) << "init tensors"; + MS_LOG(INFO) << " init tensors "; memcpy(inputs[0]->Data(), input_data, input_size); std::cout << "==================output data================" << std::endl; @@ -259,6 +257,5 @@ TEST_F(TestSliceOpenCLfp16, Slicefp16input_dim4) { } delete slice_kernel; delete sub_graph; - lite::opencl::OpenCLRuntime::DeleteInstance(); } } // namespace mindspore