diff --git a/mace/ops/eltwise_test.cc b/mace/ops/eltwise_test.cc
index d388e2c5385f0137538877db61a8cde4a99396af..7ca799e2e8701b8adb439218c17ce10d8fbd0f56 100644
--- a/mace/ops/eltwise_test.cc
+++ b/mace/ops/eltwise_test.cc
@@ -14,6 +14,7 @@
 
 #include <vector>
 
+#include "mace/ops/common/conv_pool_2d_util.h"
 #include "mace/ops/eltwise.h"
 #include "mace/ops/ops_test_util.h"
 
@@ -531,6 +532,100 @@ TEST_F(EltwiseOpTest, GPUSimpleTensorTensor) {
       {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {0, 0, 0, 0, 0, 25, 25, 25, 25, 25});
 }
 
+namespace {
+template <typename T>
+void GPUOverflowTest(const ops::EltwiseType type,
+                     const std::vector<index_t> &shape0,
+                     const std::vector<T> &input0,
+                     const std::vector<index_t> &shape1,
+                     const std::vector<T> &input1,
+                     const std::vector<index_t> &output_shape,
+                     const std::vector<T> &output) {
+  // Construct graph
+  OpsTestNet net;
+
+  // Add input data
+  net.AddInputFromArray<DeviceType::GPU, T>("Input0", shape0, input0);
+  net.AddInputFromArray<DeviceType::GPU, T>("Input1", shape1, input1);
+
+  OpDefBuilder("Eltwise", "EltwiseTest")
+      .AddIntArg("T", DataTypeToEnum<T>::v())
+      .Input("Input0")
+      .Input("Input1")
+      .AddIntArg("type", static_cast<int>(type))
+      .OutputType({ops::IsLogicalType(type) ? DT_INT32 : DT_FLOAT})
+      .Output("EltOutput")
+      .OutputShape(output_shape)
+      .Finalize(net.AddNewOperatorDef());
+  net.AddInputFromArray<DeviceType::GPU, T>(
+      "Filter",
+      {output_shape.back(), shape0.back(), 3, 3},
+      std::vector<T>(output_shape.back() * shape0.back() * 9, 1));
+  OpDefBuilder("Conv2D", "Conv2D")
+      .AddIntArg("T", DataTypeToEnum<T>::v())
+      .Input("EltOutput")
+      .Input("Filter")
+      .Output("Output")
+      .OutputShape(output_shape)
+      .AddIntsArg("strides", {1, 1})
+      .AddIntArg("padding", Padding::SAME)
+      .AddIntsArg("dilations", {1, 1})
+      .Finalize(net.AddNewOperatorDef());
+
+  // Run
+  net.RunOp(DeviceType::GPU);
+
+  auto expected = net.CreateTensor<T>(output_shape, output);
+  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
+}
+}  // namespace
+TEST_F(EltwiseOpTest, GPUOverflowTest) {
+  GPUOverflowTest<float>(
+      ops::EltwiseType::SUM, {1, 2, 2, 2}, std::vector<float>(8, 1),
+      {1, 1, 1, 2}, {1, 1},
+      {1, 2, 2, 1}, {16, 16, 16, 16});
+  GPUOverflowTest<float>(
+      ops::EltwiseType::SUB, {2, 2, 2, 2}, std::vector<float>(16, 1),
+      {2, 1, 1, 2}, {1, 1, 2, 2},
+      {2, 2, 2, 1}, {0, 0, 0, 0, -8, -8, -8, -8});
+  GPUOverflowTest<float>(
+      ops::EltwiseType::PROD, {1, 3, 2, 1}, std::vector<float>(6, 1),
+      {1, 3, 2, 1}, std::vector<float>(6, 1),
+      {1, 3, 2, 1}, {4, 4, 6, 6, 4, 4});
+  GPUOverflowTest<float>(
+      ops::EltwiseType::DIV, {2, 3, 2, 1}, std::vector<float>(12, 1),
+      {2, 3, 2, 1}, std::vector<float>(12, 1),
+      {2, 3, 2, 1}, {4, 4, 6, 6, 4, 4, 4, 4, 6, 6, 4, 4});
+  GPUOverflowTest<float>(
+      ops::EltwiseType::MIN, {1, 2, 2, 2}, std::vector<float>(8, 1),
+      {1, 1, 1, 2}, {1, 1},
+      {1, 2, 2, 1}, {8, 8, 8, 8});
+  GPUOverflowTest<float>(
+      ops::EltwiseType::MAX, {2, 2, 2, 2}, std::vector<float>(16, 1),
+      {2, 1, 1, 2}, {1, 1, 2, 2},
+      {2, 2, 2, 1}, {8, 8, 8, 8, 16, 16, 16, 16});
+  GPUOverflowTest<float>(
+      ops::EltwiseType::NEG, {1, 3, 2, 1}, std::vector<float>(6, 1),
+      {1, 1, 1, 1}, {0},
+      {1, 3, 2, 1}, {-4, -4, -6, -6, -4, -4});
+  GPUOverflowTest<float>(
+      ops::EltwiseType::ABS, {2, 3, 2, 1}, std::vector<float>(12, -1),
+      {1, 1, 1, 1}, {0},
+      {2, 3, 2, 1}, {4, 4, 6, 6, 4, 4, 4, 4, 6, 6, 4, 4});
+  GPUOverflowTest<float>(
+      ops::EltwiseType::SQR_DIFF, {2, 2, 2, 2}, std::vector<float>(16, 1),
+      {2, 1, 1, 2}, {1, 1, 2, 2},
+      {2, 2, 2, 1}, {0, 0, 0, 0, 8, 8, 8, 8});
+  GPUOverflowTest<float>(
+      ops::EltwiseType::POW, {1, 3, 2, 1}, std::vector<float>(6, 1),
+      {1, 3, 2, 1}, std::vector<float>(6, 1),
+      {1, 3, 2, 1}, {4, 4, 6, 6, 4, 4});
+  GPUOverflowTest<float>(
+      ops::EltwiseType::FLOOR_DIV, {2, 2, 2, 2}, std::vector<float>(16, 1),
+      {2, 1, 1, 2}, {1, 1, 2, 2},
+      {2, 2, 2, 1}, {8, 8, 8, 8, 0, 0, 0, 0});
+}
+
 namespace {
 template <typename T>
 void RandomTensorScalar(const ops::EltwiseType type,
diff --git a/mace/ops/opencl/cl/eltwise.cl b/mace/ops/opencl/cl/eltwise.cl
index 23ab75591a09e6b824c83dfacdfe43f32f5bf5d3..167e5341a8aa0737dccc1fa9c9ef865ddae55c34 100644
--- a/mace/ops/opencl/cl/eltwise.cl
+++ b/mace/ops/opencl/cl/eltwise.cl
@@ -3,7 +3,7 @@
 __kernel void eltwise(OUT_OF_RANGE_PARAMS
                       GLOBAL_WORK_GROUP_SIZE_DIM3
                       __read_only image2d_t input0,
-#if INPUT_TYPE == 1
+#if defined(INPUT_SCALAR)
                       __private const float value,
 #else
                       __read_only image2d_t input1,
@@ -28,14 +28,14 @@ __kernel void eltwise(OUT_OF_RANGE_PARAMS
   const int pos = mad24(chan_idx, width, width_idx);
 
   DATA_TYPE4 in0 = READ_IMAGET(input0, SAMPLER, (int2)(pos, hb));
-#if INPUT_TYPE == 1
+#if defined(INPUT_SCALAR)
   DATA_TYPE4 in1 = (DATA_TYPE4)(value, value, value, value);
-#elif INPUT_TYPE == 2
+#elif defined(INPUT_VECTOR)
+  DATA_TYPE4 in1 = READ_IMAGET(input1, SAMPLER, (int2)(chan_idx, 0));
+#elif defined(INPUT_BATCH_VECTOR)
   const int batch_idx = hb / height;
   DATA_TYPE4 in1 = READ_IMAGET(input1, SAMPLER, (int2)(chan_idx, batch_idx));
-#elif INPUT_TYPE == 3
-  DATA_TYPE4 in1 = READ_IMAGET(input1, SAMPLER, (int2)(chan_idx, 0));
-#elif INPUT_TYPE == 4
+#elif defined(INPUT_TENSOR_BC_CHAN)
   DATA_TYPE4 tmp = READ_IMAGET(input1, SAMPLER, (int2)(width_idx, hb));
   DATA_TYPE4 in1 = (DATA_TYPE4)(tmp.x, tmp.x, tmp.x, tmp.x);
 #else
@@ -89,11 +89,11 @@ __kernel void eltwise(OUT_OF_RANGE_PARAMS
 #endif
 #endif
 
-#if ((INPUT_TYPE == 1 || INPUT_TYPE == 4) && \
-     (ELTWISE_TYPE == 0 || ELTWISE_TYPE == 1 || ELTWISE_TYPE == 4 || \
-      ELTWISE_TYPE == 5 || ELTWISE_TYPE == 8 || ELTWISE_TYPE == 9)) || \
-    ((INPUT_TYPE != 1 || INPUT_TYPE != 4) && \
-     (ELTWISE_TYPE == 3 || ELTWISE_TYPE == 9 || ELTWISE_TYPE == 11))
+#if defined(NOT_DIVISIBLE_FOUR) && \
+    ((ELTWISE_TYPE == 3 || ELTWISE_TYPE == 9 || ELTWISE_TYPE == 11) \
+     || ((defined(INPUT_SCALAR) || defined(INPUT_TENSOR_BC_CHAN)) && \
+         (ELTWISE_TYPE == 0 || ELTWISE_TYPE == 1 || ELTWISE_TYPE == 4 || \
+          ELTWISE_TYPE == 5 || ELTWISE_TYPE == 8)))
   const int remain_channel = channel - 4 * chan_idx;
   if (remain_channel < 4) {
     switch (remain_channel) {
diff --git a/mace/ops/opencl/image/eltwise.h b/mace/ops/opencl/image/eltwise.h
index 8e6d3bc758fcaf4167aa841e83c9194687314330..bc1a702532fcfec6f32866fc332bdfe717f79416 100644
--- a/mace/ops/opencl/image/eltwise.h
+++ b/mace/ops/opencl/image/eltwise.h
@@ -67,25 +67,52 @@ MaceStatus EltwiseKernel<T>::Compute(
     const Tensor *input1,
     Tensor *output) {
   bool swapped = false;
-  if (input1 != nullptr) {
+  std::string input1_type = "";
+  if (input1 == nullptr) {
+    input1_type = "INPUT_SCALAR";
+  } else {
     MACE_CHECK(input0->dim_size() == input1->dim_size() ||
                input0->dim_size() == 1 || input1->dim_size() == 1)
         << "Inputs of Eltwise op must be same shape";
+    MACE_CHECK(type_ != EltwiseType::EQUAL)
+        << "Eltwise op on GPU does not support EQUAL";
+    // broadcast
     if (input0->size() != input1->size()) {
       if (input0->size() < input1->size()) {
         std::swap(input0, input1);
         swapped = true;
       }
-      if (input1->dim_size() == 1) {
-        MACE_CHECK(input0->dim(3) == input1->dim(0))
-            << "Element-Wise op support broadcast on only-channel or non-channel dimension";  // NOLINT(whitespace/line_length)
-      } else {
-        MACE_CHECK(((input0->dim(0) == input1->dim(0) || input1->dim(0) == 1)
-            && input0->dim(3) == input1->dim(3) && input1->dim(1) == 1 &&
-            input1->dim(2) == 1) || (input0->dim(0) == input1->dim(0) &&
-            input0->dim(1) == input1->dim(1) && input0->dim(2) == input1->dim(2)
-            && input1->dim(3) == 1))
-            << "Element-Wise op support broadcast on only-channel or non-channel dimension";  // NOLINT(whitespace/line_length)
+      if (input1->dim_size() == 1
+          || (input1->dim(0) == 1 && input1->dim(1) == 1
+              && input1->dim(2) == 1)) {
+        // Tensor-Vector element wise
+        if (input0->dim(3) == input1->dim(input1->dim_size()-1)) {
+          input1_type = "INPUT_VECTOR";
+        } else {
+          LOG(FATAL) << "Inputs not match the broadcast logic, "
+                     << MakeString(input0->shape()) << " vs "
+                     << MakeString(input1->shape());
+        }
+      } else {  // must be 4-D
+        if (input0->dim(0) == input1->dim(0)
+            && input1->dim(1) == 1
+            && input1->dim(2) == 1
+            && input0->dim(3) == input1->dim(3)) {
+          input1_type = "INPUT_BATCH_VECTOR";
+        } else if (input0->dim(0) == input1->dim(0)
+                   && input0->dim(1) == input1->dim(1)
+                   && input0->dim(2) == input1->dim(2)
+                   && input1->dim(3) == 1) {
+          // broadcast on channel dimension
+          input1_type = "INPUT_TENSOR_BC_CHAN";
+        } else {
+          LOG(FATAL) << "Element-Wise op only support broadcast on"
+                        " channel dimension:"
+                        "Tensor-BatchVector(4D-[N,1,1,C]) "
+                        "and Tensor-Tensor(4D-[N,H,W,1]). but got "
+                     << MakeString(input0->shape()) << " vs "
+                     << MakeString(input1->shape());
+        }
       }
     }
   }
@@ -129,20 +156,11 @@ MaceStatus EltwiseKernel<T>::Compute(
     built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
     built_options.emplace(MakeString("-DELTWISE_TYPE=", type_));
-    if (input1 == nullptr) {
-      built_options.emplace("-DINPUT_TYPE=1");
-    } else if (input0->size() != input1->size()) {
-      if (input0->dim(0) == input1->dim(0) && input0->dim(1) == input1->dim(1)
-          && input0->dim(2) == input1->dim(2) && input1->dim(3) == 1) {
-        // only broadcast on channel
-        built_options.emplace("-DINPUT_TYPE=4");
-      } else if (input1->dim(0) == 1 || input1->dim_size() == 1) {
-        built_options.emplace("-DINPUT_TYPE=3");
-      } else {
-        built_options.emplace("-DINPUT_TYPE=2");
-      }
-      if (swapped) built_options.emplace("-DSWAPPED");
+    if (!input1_type.empty()) {
+      built_options.emplace("-D" + input1_type);
     }
+    if (swapped) built_options.emplace("-DSWAPPED");
+    if (channels % 4 != 0) built_options.emplace("-DNOT_DIVISIBLE_FOUR");
     if (!coeff_.empty()) built_options.emplace("-DCOEFF_SUM");
     MACE_RETURN_IF_ERROR(runtime->BuildKernel("eltwise", kernel_name,
                                               built_options, &kernel_));
diff --git a/mace/ops/ops_test_util.cc b/mace/ops/ops_test_util.cc
index 0604bdc410925977224a5f82763b6234ec40722a..ce9c1bbde07ddd8857f33718f06eb47d1fb34fa9 100644
--- a/mace/ops/ops_test_util.cc
+++ b/mace/ops/ops_test_util.cc
@@ -159,16 +159,16 @@ void OpTestContext::SetOCLImageAndBufferTestFlag() {
 
 bool OpsTestNet::Setup(mace::DeviceType device) {
   NetDef net_def;
-  for (auto &op_def_ : op_defs_) {
-    net_def.add_op()->CopyFrom(op_def_);
+  for (auto &op_def : op_defs_) {
+    net_def.add_op()->CopyFrom(op_def);
 
-    for (auto input : op_def_.input()) {
+    for (auto input : op_def.input()) {
       if (ws_.GetTensor(input) != nullptr &&
           !ws_.GetTensor(input)->is_weight()) {
         auto input_info = net_def.add_input_info();
         input_info->set_name(input);
         auto data_format = ProtoArgHelper::GetOptionalArg(
-            op_def_, "data_format", DataFormat::DF_NONE);
+            op_def, "data_format", DataFormat::DF_NONE);
         input_info->set_data_format(data_format);
         auto &shape = ws_.GetTensor(input)->shape();
         for (auto d : shape) {
@@ -176,16 +176,16 @@ bool OpsTestNet::Setup(mace::DeviceType device) {
         }
       }
     }
-
-    for (int i = 0; i < op_def_.output_size(); ++i) {
-      ws_.RemoveTensor(op_def_.output(i));
-      auto output_info = net_def.add_output_info();
-      output_info->set_name(op_def_.output(i));
-      if (op_def_.output_type_size() == op_def_.output_size()) {
-        output_info->set_data_type(op_def_.output_type(i));
-      } else {
-        output_info->set_data_type(DataType::DT_FLOAT);
-      }
+  }
+  auto op_def = op_defs_.back();
+  for (int i = 0; i < op_def.output_size(); ++i) {
+    ws_.RemoveTensor(op_def.output(i));
+    auto output_info = net_def.add_output_info();
+    output_info->set_name(op_def.output(i));
+    if (op_def.output_type_size() == op_def.output_size()) {
+      output_info->set_data_type(op_def.output_type(i));
+    } else {
+      output_info->set_data_type(DataType::DT_FLOAT);
     }
   }
   MemoryOptimizer mem_optimizer;