diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_buffer.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_buffer.cl
index 4e9a8422be6b3044a2fa8d4863ba98a75039dab3..691d91ef9beec3451099950b24f1d044a36c6fc2 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_buffer.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_buffer.cl
@@ -26,26 +26,9 @@ __kernel void ElementDiv(__global float *input_a, __global float *input_b, __glo
   output[idx] = input_a[idx] * input_b[idx];
 }
 
-__kernel void BoardcastAdd(__global float *input_a, float input_b, __global float *output, const unsigned int n) {
+__kernel void BoardcastArith(__global float *input_a, float weight, float bias, __global float *output,
+                             const unsigned int n) {
   int idx = get_global_id(0);
   if (idx >= n) return;
-  output[idx] = input_a[idx] + input_b;
-}
-
-__kernel void BoardcastSub(__global float *input_a, float input_b, __global float *output, const unsigned int n) {
-  int idx = get_global_id(0);
-  if (idx >= n) return;
-  output[idx] = input_a[idx] - input_b;
-}
-
-__kernel void BoardcastMul(__global float *input_a, float input_b, __global float *output, const unsigned int n) {
-  int idx = get_global_id(0);
-  if (idx >= n) return;
-  output[idx] = input_a[idx] * input_b;
-}
-
-__kernel void BoardcastDiv(__global float *input_a, float input_b, __global float *output, const unsigned int n) {
-  int idx = get_global_id(0);
-  if (idx >= n) return;
-  output[idx] = input_a[idx] * input_b;
+  output[idx] = weight * input_a[idx] + bias;
 }
diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_image2d.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_image2d.cl
index 12f75438f96a8e24d0af817b6e9f3754ae5a2707..33d6143f9d4bf91be493a9685881887870f52a22 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_image2d.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_image2d.cl
@@ -1,15 +1,65 @@
 __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
 
-__kernel void ElementAdd(__read_only image2d_t *input_a, __read_only image2d_t *input_b, __write_only image2d_t *output,
-                         const int4 output_shape) {
+__kernel void ElementAdd(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output,
+                         const int2 output_shape) {
   int X = get_global_id(0);
   int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) return;
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
 
-  if (idx >= n) return;
-  float4 a = read_imagef(input_a, smp_none, (int2)(X, Y * output_shape.w + Z));
-  float4 b = read_imagef(input_b, smp_none, (int2)(X, Y * output_shape.w + Z));
-  src = a + b;
-  write_imagef(output, (int2)(0, 0), src);
+  float4 a = read_imagef(input_a, smp_none, (int2)(X, Y));
+  float4 b = read_imagef(input_b, smp_none, (int2)(X, Y));
+  write_imagef(output, (int2)(X, Y), a + b);
+}
+
+__kernel void ElementSub(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output,
+                         const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+
+  float4 a = read_imagef(input_a, smp_none, (int2)(X, Y));
+  float4 b = read_imagef(input_b, smp_none, (int2)(X, Y));
+  write_imagef(output, (int2)(X, Y), a - b);
+}
+
+__kernel void ElementMul(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output,
+                         const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+
+  float4 a = read_imagef(input_a, smp_none, (int2)(X, Y));
+  float4 b = read_imagef(input_b, smp_none, (int2)(X, Y));
+  write_imagef(output, (int2)(X, Y), a * b);
+}
+
+__kernel void ElementDiv(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output,
+                         const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+
+  float4 a = read_imagef(input_a, smp_none, (int2)(X, Y));
+  float4 b = read_imagef(input_b, smp_none, (int2)(X, Y));
+  write_imagef(output, (int2)(X, Y), a / b);
+}
+
+__kernel void BoardcastArith(__read_only image2d_t input_a, float weight, float bias, __write_only image2d_t output,
+                             const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+
+  float4 a = read_imagef(input_a, smp_none, (int2)(X, Y));
+  write_imagef(output, (int2)(X, Y), weight * a + bias);
 }
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc
index 8cd3113797d3401dc0e4038d3d38f174bb2b55a8..1deeaf51b44092a7d2b2419e3bbd08a995502594 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc
@@ -40,10 +40,10 @@ std::vector<size_t> ArithmeticOpenCLKernel::InitGlobalSize() const {
 }
 
 void ArithmeticOpenCLKernel::Image2dGetWorkGroupSize() {
-  global_size_ = InitGlobalSize();
-  int max_work_group_size = runtime_->GetKernelMaxWorkGroupSize(kernel_(), (*runtime_->Device())());
-  local_size_ = GetCommonLocalSize(global_size_, max_work_group_size);
-  global_size_ = GetCommonGlobalSize(local_size_, global_size_);
+  size_t H = outputs_[0]->Batch() * outputs_[0]->Height();
+  size_t W = outputs_[0]->Width() * UP_DIV(outputs_[0]->Channel(), C4NUM);
+  local_size_ = {16, 16};
+  global_size_ = {H, W};
 }
 
 void ArithmeticOpenCLKernel::BufferGetWorkGroupSize() {
@@ -51,63 +51,75 @@ void ArithmeticOpenCLKernel::BufferGetWorkGroupSize() {
   global_size_ = {element_num};
 }
 
+int ArithmeticOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t>* img_size) {
+  size_t CO4 = UP_DIV(outputs_[0]->Channel(), C4NUM);
+  int H = outputs_[0]->Batch() * outputs_[0]->Height();
+  int W = outputs_[0]->Width() * CO4;
+  size_t im_dst_x, im_dst_y;
+  if (inputs_[0]->GetFormat() == schema::Format_NHWC4) {
+    im_dst_x = W;
+    im_dst_y = H;
+  } else {
+    im_dst_y = outputs_[0]->Batch() * outputs_[0]->Height() * CO4;
+    im_dst_x = outputs_[0]->Width();
+  }
+#ifdef ENABLE_FP16
+  size_t img_dtype = CL_HALF_FLOAT;
+#else
+  size_t img_dtype = CL_FLOAT;
+#endif
+  img_size->clear();
+  std::vector<size_t> vec{im_dst_x, im_dst_y, img_dtype};
+  *img_size = vec;
+  return 0;
+}
+
 int ArithmeticOpenCLKernel::Init() {
   runtime_ = lite::opencl::OpenCLRuntime::GetInstance();
-  std::string element_name;
-  std::string boardcast_name;
+  std::string kernel_name;
 
   if (inputs_[1]->TensorType() == schema::NodeType_ValueNode && inputs_[1]->Data() != nullptr) {
     element_flag_ = false;
+    kernel_name = "BoardcastArith";
   } else {
     element_flag_ = true;
+    switch (opParameter->type_) {
+      case PrimitiveType_Mul:
+        kernel_name = "ElementMul";
+        break;
+      case PrimitiveType_Add:
+        kernel_name = "ElementAdd";
+        break;
+      case PrimitiveType_Sub:
+        kernel_name = "ElementSub";
+        break;
+      case PrimitiveType_Div:
+        kernel_name = "ElementDiv";
+        break;
+      default:
+        MS_LOG(ERROR) << "Error Operator type " << opParameter->type_;
+        break;
+    }
   }
 
-  switch (opParameter->type_) {
-    case PrimitiveType_Mul:
-      element_name = "ElementMul";
-      boardcast_name = "BoardcastMul";
-      break;
-    case PrimitiveType_Add:
-      element_name = "ElementAdd";
-      boardcast_name = "BoardcastAdd";
-      break;
-    case PrimitiveType_Sub:
-      element_name = "ElementSub";
-      boardcast_name = "BoardcastSub";
-      break;
-    case PrimitiveType_Div:
-      element_name = "ElementDiv";
-      boardcast_name = "BoardcastDiv";
-      break;
-    default:
-      MS_LOG(ERROR) << "Error Operator type " << opParameter->type_;
-      break;
-  }
 
 #ifdef PROGRAM_WITH_IL
   runtime_->CreateKernelFromIL(kernel_(), kernel_name);
 #else
   std::string program_name = "Arithmetic";
   std::set<std::string> build_options;
-  std::string source = arithmetic_buffer_source_fp32;
+  std::string source = arithmetic_image2d_source_fp32;
   runtime_->LoadSource(program_name, source);
-
-  if (element_flag_) {
-    runtime_->BuildKernel(kernel_, program_name, element_name, build_options);
-    MS_LOG(DEBUG) << element_name << " Init Done!";
-  } else {
-    runtime_->BuildKernel(kernel_, program_name, boardcast_name, build_options);
-    MS_LOG(DEBUG) << boardcast_name << " Init Done!";
-  }
+  runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
 #endif
   outputs_[0]->SetFormat(schema::Format_NHWC4);
+  Image2dGetWorkGroupSize();
   return 0;
 }
 
 int ArithmeticOpenCLKernel::Run() {
   MS_LOG(DEBUG) << this->Name() << " Running!";
   auto runtime_ = lite::opencl::OpenCLRuntime::GetInstance();
-  BufferGetWorkGroupSize();
 
   int arg_idx = 0;
   uint32_t element_num = outputs_[0]->ElementsC4Num();
@@ -116,11 +128,34 @@ int ArithmeticOpenCLKernel::Run() {
   if (element_flag_) {
     runtime_->SetKernelArg(kernel_, arg_idx++, inputs_[1]->Data());
   } else {
-    runtime_->SetKernelArg(kernel_, arg_idx++, static_cast<float *>(inputs_[1]->Data())[0]);
+    float value = static_cast<float *>(inputs_[1]->Data())[0];
+    switch (opParameter->type_) {
+      case PrimitiveType_Mul:
+        weight_ = value;
+        break;
+      case PrimitiveType_Add:
+        bias_ = value;
+        break;
+      case PrimitiveType_Sub:
+        bias_ = -1 * value;
+        break;
+      case PrimitiveType_Div:
+        bias_ = 1 / value;
+        break;
+      default:
+        MS_LOG(ERROR) << "Error Operator type " << opParameter->type_;
+        break;
+    }
+    runtime_->SetKernelArg(kernel_, arg_idx++, weight_);
+    runtime_->SetKernelArg(kernel_, arg_idx++, bias_);
+    MS_LOG(DEBUG) << arg_idx-2 << " " << weight_;
+    MS_LOG(DEBUG) << arg_idx-1 << " " << bias_;
   }
   runtime_->SetKernelArg(kernel_, arg_idx++, outputs_[0]->Data());
-  runtime_->SetKernelArg(kernel_, arg_idx++, element_num);
-
+  int H = outputs_[0]->Batch() * outputs_[0]->Height();
+  int W = outputs_[0]->Width() * UP_DIV(outputs_[0]->Channel(), C4NUM);
+  cl_int2 output_shape{H, W};
+  runtime_->SetKernelArg(kernel_, arg_idx++, output_shape);
   runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr);
   return 0;
 }
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h
index 37143775b0c1bac1baf5fefcd591e62ba6246645..5210f8e771885b3db0373c1179b574e62bca43a6 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h
@@ -24,15 +24,16 @@
 
 namespace mindspore::kernel {
 
-class ArithmeticOpenCLKernel : public ArithmeticCPUKernel {
+class ArithmeticOpenCLKernel : public OpenCLKernel {
  public:
   explicit ArithmeticOpenCLKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
                                   const std::vector<lite::tensor::Tensor *> &outputs, const lite::Context *ctx)
-      : ArithmeticCPUKernel(parameter, inputs, outputs, ctx) {}
+      : OpenCLKernel(parameter, inputs, outputs) {}
   ~ArithmeticOpenCLKernel() override{};
 
   int Init() override;
   int Run() override;
+  int GetImageSize(size_t idx, std::vector<size_t>* img_size) override;
 
  private:
   std::vector<size_t> InitGlobalSize() const;
@@ -42,6 +43,8 @@ class ArithmeticOpenCLKernel : public ArithmeticCPUKernel {
   cl::Kernel kernel_;
   lite::opencl::OpenCLRuntime *runtime_;
   bool element_flag_{true};
+  float weight_{1.f};
+  float bias_{.0f};
 
   std::vector<size_t> local_size_;
   std::vector<size_t> global_size_;
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/arithmetic_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/arithmetic_tests.cc
index e7fccfe1a3edc43763d85bbe729e8a4c51ef7e0d..5d9e99dd0b81cab6cbbd5d4b74985ab09180c5bb 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/arithmetic_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/arithmetic_tests.cc
@@ -61,13 +61,12 @@ void LogData(void *data, const int size, const std::string prefix) {
 }
 
 void TestCase(const std::vector<int> &shape_a, const std::vector<int> &shape_b) {
-  std::cout << "TestCase" << std::endl;
   auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
+  auto allocator = ocl_runtime->GetAllocator();
 
   bool is_bias_add = shape_b.empty();
   auto tensorType = schema::NodeType_ValueNode;
 
-  std::cout << "TestCase tensor" << std::endl;
   lite::tensor::Tensor *tensor_a =
     new lite::tensor::Tensor(kNumberTypeFloat32, shape_a, schema::Format_NHWC4, tensorType);
   lite::tensor::Tensor *tensor_b =
@@ -77,7 +76,6 @@ void TestCase(const std::vector<int> &shape_a, const std::vector<int> &shape_b)
   int64_t element_num = tensor_a->ElementsC4Num();
   int64_t element_num_b = is_bias_add ? 1 : tensor_b->ElementsC4Num();
 
-  std::cout << "TestCase new data" << std::endl;
   float *data_a = new float[element_num];
   float *data_b = new float[element_num_b];
   float *data_c_cpu = new float[element_num];
@@ -87,14 +85,12 @@ void TestCase(const std::vector<int> &shape_a, const std::vector<int> &shape_b)
   InitData(data_b, element_num_b);
   memset(data_c_ocl, 0, sizeof(float) * element_num);
 
-  std::cout << "TestCase run cpu" << std::endl;
   if (is_bias_add) {
     BoardcaseAdd(data_a, static_cast<float *>(data_b)[0], data_c_cpu, element_num);
   } else {
     ElementAdd(data_a, data_b, data_c_cpu, element_num);
   }
 
-  std::cout << "TestCase set data" << std::endl;
   std::vector<lite::tensor::Tensor *> inputs = {tensor_a};
   if (!is_bias_add) {
     inputs.push_back(tensor_b);
@@ -114,9 +110,10 @@ void TestCase(const std::vector<int> &shape_a, const std::vector<int> &shape_b)
     new kernel::ArithmeticOpenCLKernel(reinterpret_cast<OpParameter *>(param), arithmetic_inputs, outputs, &ctx);
   arith_kernel->Init();
 
+  tensor_a->MallocData(allocator);
+  tensor_b->MallocData(allocator);
   std::vector<kernel::LiteKernel *> kernels{arith_kernel};
   auto *kernel = new kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels);
-  std::cout << "TestCase Init" << std::endl;
   kernel->Init();
 
   memcpy(inputs[0]->Data(), data_a, sizeof(float) * element_num);
@@ -124,7 +121,6 @@ void TestCase(const std::vector<int> &shape_a, const std::vector<int> &shape_b)
     memcpy(inputs[1]->Data(), data_b, sizeof(float) * element_num_b);
   }
 
-  std::cout << "TestCase Run" << std::endl;
   kernel->Run();
 
   memcpy(data_c_ocl, outputs[0]->Data(), sizeof(float) * element_num);
@@ -136,7 +132,6 @@ void TestCase(const std::vector<int> &shape_a, const std::vector<int> &shape_b)
   LogData(outputs[0]->Data(), 10, "OpenCL compute : ");
   bool cmp = DataCompare(data_c_cpu, data_c_ocl, element_num);
   MS_LOG(INFO) << "Compare " << (cmp ? "success!" : "failed!");
-  std::cout << "TestCase End" << std::endl;
 
   // free
   delete[] data_a;
@@ -162,15 +157,15 @@ class TestArithmeticOpenCL : public mindspore::Common {
 };
 
 TEST_F(TestArithmeticOpenCL, AddElementwiseTest) {
-  const std::vector<int> &shape_a = {1, 32, 32, 4};
-  const std::vector<int> &shape_b = {1, 32, 32, 4};
+  const std::vector<int> &shape_a = {1, 1024, 1024, 4};
+  const std::vector<int> &shape_b = {1, 1024, 1024, 4};
   TestCase(shape_a, shape_b);
 }
 
-// TEST_F(TestOpenCLKernel, AddBoardcaseTest) {
-//  const std::vector<int> &shape_a = {1, 4, 128, 128};
-//  const std::vector<int> &shape_b = {};
-//  TestCase(shape_a, shape_b);
-//}
+TEST_F(TestArithmeticOpenCL, AddBoardcaseTest) {
+  const std::vector<int> &shape_a = {1, 128, 128, 4};
+  const std::vector<int> &shape_b = {};
+  TestCase(shape_a, shape_b);
+}
 
 }  // namespace mindspore