diff --git a/mace/ops/common/reduce_type.h b/mace/ops/common/reduce_type.h
index 667f6bece40be4bfb4d0594c9920bcdb6a3e0918..18351a980ea71d0e02c29fd768ae0b6fbb55eda5 100644
--- a/mace/ops/common/reduce_type.h
+++ b/mace/ops/common/reduce_type.h
@@ -18,11 +18,11 @@
 
 namespace mace {
 enum ReduceType {
-//  SUM = 0,
   MEAN = 0,
   MIN = 1,
   MAX = 2,
   PROD = 3,
+  SUM = 4,
-//  SUM_SQR = 4,
-//  SQR_MEAN = 5,
+//  SUM_SQR = 5,
+//  SQR_MEAN = 6,
 };
diff --git a/mace/ops/opencl/cl/reduce.cl b/mace/ops/opencl/cl/reduce.cl
index 509ecd0fcd23daf2923dc49dd317780238e6f69f..f7ea8697fefb37ee2b2b83c1659021ebc77fc451 100644
--- a/mace/ops/opencl/cl/reduce.cl
+++ b/mace/ops/opencl/cl/reduce.cl
@@ -62,7 +62,7 @@ __kernel void reduce(OUT_OF_RANGE_PARAMS
 // PROD
 #elif REDUCE_TYPE == 3
     part_result = part_result * in;
-// MEAN
+// MEAN or SUM
 #else
     part_result = part_result + in;
 #endif
diff --git a/mace/ops/reduce.cc b/mace/ops/reduce.cc
index 28083312872d269d49b9b509525aa5ee6021b6b0..cd78320cbd350382f57a4f3f35a8e98cb32f27df 100644
--- a/mace/ops/reduce.cc
+++ b/mace/ops/reduce.cc
@@ -167,6 +167,12 @@ class ReduceOp<DeviceType::CPU, T> : public ReduceOpBase {
           tmp = tmp * input[i];
         }
         output[0] = tmp;
+      } else if (type == ReduceType::SUM) {
+        T tmp = 0;
+        for (int i = 0; i < data_reshape_[0]; ++i) {
+          tmp = tmp + input[i];
+        }
+        output[0] = tmp;
       } else {
         MACE_NOT_IMPLEMENTED;
       }
@@ -216,6 +222,14 @@ class ReduceOp<DeviceType::CPU, T> : public ReduceOpBase {
             }
             output[i] = tmp;
           }
+        } else if (type == ReduceType::SUM) {
+          for (index_t i = start; i < end; i += step) {
+            T tmp = 0;
+            for (int j = 0; j < data_reshape_[0]; ++j) {
+              tmp += input[j * data_reshape_[1] + i];
+            }
+            output[i] = tmp;
+          }
         } else {
           MACE_NOT_IMPLEMENTED;
         }
@@ -254,6 +268,14 @@ class ReduceOp<DeviceType::CPU, T> : public ReduceOpBase {
             }
             output[i] = tmp;
           }
+        } else if (type == ReduceType::SUM) {
+          for (index_t i = start; i < end; i += step) {
+            T tmp = 0;
+            for (int j = 0; j < data_reshape_[1]; ++j) {
+              tmp += input[i * data_reshape_[1] + j];
+            }
+            output[i] = tmp;
+          }
         } else {
           MACE_NOT_IMPLEMENTED;
         }
@@ -319,6 +341,16 @@ class ReduceOp<DeviceType::CPU, T> : public ReduceOpBase {
             }
             output[i] = tmp;
           }
+        } else if (type == ReduceType::SUM) {
+          for (index_t i = start; i < end; i += step) {
+            for (int j = 0; j < data_reshape_[2]; ++j) {
+              for (int k = 0; k < data_reshape_[0]; ++k) {
+                output[i] +=
+                    input[(k * data_reshape_[1] + i) * data_reshape_[2]
+                        + j];
+              }
+            }
+          }
         } else {
           MACE_NOT_IMPLEMENTED;
         }
@@ -371,6 +403,16 @@ class ReduceOp<DeviceType::CPU, T> : public ReduceOpBase {
               output[i * data_reshape_[2] + j] = tmp;
             }
           }
+        } else if (type == ReduceType::SUM) {
+          for (index_t i = start; i < end; i += step) {
+            for (int j = 0; j < data_reshape_[2]; ++j) {
+              for (int k = 0; k < data_reshape_[1]; ++k) {
+                output[i * data_reshape_[2] + j] +=
+                    input[(i * data_reshape_[1] + k) * data_reshape_[2]
+                        + j];
+              }
+            }
+          }
         } else {
           MACE_NOT_IMPLEMENTED;
         }
@@ -445,6 +487,18 @@ class ReduceOp<DeviceType::CPU, T> : public ReduceOpBase {
               output[i * data_reshape_[3] + j] = tmp;
             }
           }
+        } else if (type == ReduceType::SUM) {
+          for (index_t i = start0; i < end0; i += step0) {
+            for (index_t j = start1; j < end1; j += step1) {
+              for (int k = 0; k < data_reshape_[2]; ++k) {
+                for (int t = 0; t < data_reshape_[0]; ++t) {
+                  output[i * data_reshape_[3] + j] +=
+                      input[((t * data_reshape_[1] + i) *
+                          data_reshape_[2] + k) * data_reshape_[3] + j];
+                }
+              }
+            }
+          }
         } else {
           MACE_NOT_IMPLEMENTED;
         }
@@ -513,6 +567,18 @@ class ReduceOp<DeviceType::CPU, T> : public ReduceOpBase {
               output[i * data_reshape_[2] + j] = tmp;
             }
           }
+        } else if (type == ReduceType::SUM) {
+          for (index_t i = start0; i < end0; i += step0) {
+            for (index_t j = start1; j < end1; j += step1) {
+              for (int k = 0; k < data_reshape_[1]; ++k) {
+                for (int t = 0; t < data_reshape_[3]; ++t) {
+                  output[i * data_reshape_[2] + j] +=
+                      input[((i * data_reshape_[1] + k) *
+                          data_reshape_[2] + j) * data_reshape_[3] + t];
+                }
+              }
+            }
+          }
         } else {
           MACE_NOT_IMPLEMENTED;
         }
@@ -574,6 +640,12 @@ void ReduceOp<DeviceType::CPU, uint8_t>::Reduce1Dims(
         tmp = std::max<uint8_t>(tmp, input[i]);
       }
       output[0] = tmp;
+    } else if (type == ReduceType::SUM) {
+      uint32_t tmp = 0;
+      for (int i = 0; i < data_reshape_[0]; ++i) {
+        tmp = tmp + input[i];
+      }
+      output[0] = static_cast<uint8_t>(std::min<uint32_t>(tmp, 255));  // clamp
     } else {
       MACE_NOT_IMPLEMENTED;
     }
@@ -616,6 +688,14 @@ void ReduceOp<DeviceType::CPU, uint8_t>::Reduce2Dims(
           }
           output[i] = tmp;
         }
+      } else if (type == ReduceType::SUM) {
+        for (index_t i = start; i < end; i += step) {
+          uint32_t tmp = 0;
+          for (int j = 0; j < data_reshape_[0]; ++j) {
+            tmp += input[j * data_reshape_[1] + i];
+          }
+          output[i] = static_cast<uint8_t>(std::min<uint32_t>(tmp, 255));
+        }
       } else {
         MACE_NOT_IMPLEMENTED;
       }
@@ -647,6 +727,14 @@ void ReduceOp<DeviceType::CPU, uint8_t>::Reduce2Dims(
           }
           output[i] = tmp;
         }
+      } else if (type == ReduceType::SUM) {
+        for (index_t i = start; i < end; i += step) {
+          uint32_t tmp = 0;
+          for (int j = 0; j < data_reshape_[1]; ++j) {
+            tmp += input[i * data_reshape_[1] + j];
+          }
+          output[i] = static_cast<uint8_t>(std::min<uint32_t>(tmp, 255));
+        }
       } else {
         MACE_NOT_IMPLEMENTED;
       }
@@ -699,6 +787,17 @@ void ReduceOp<DeviceType::CPU, uint8_t>::Reduce3Dims(
           }
           output[i] = tmp;
         }
+      } else if (type == ReduceType::SUM) {
+        for (index_t i = start; i < end; i += step) {
+          uint32_t tmp = 0;
+          for (int j = 0; j < data_reshape_[2]; ++j) {
+            for (int k = 0; k < data_reshape_[0]; ++k) {
+              tmp += input[(k * data_reshape_[1] + i) * data_reshape_[2] + j];
+            }
+          }
+          // SUM has no divisor, so no rounding term; saturate to uint8_t.
+          output[i] = static_cast<uint8_t>(std::min<uint32_t>(tmp, 255));
+        }
       } else {
         MACE_NOT_IMPLEMENTED;
       }
@@ -742,6 +841,17 @@ void ReduceOp<DeviceType::CPU, uint8_t>::Reduce3Dims(
             output[i * data_reshape_[2] + j] = tmp;
           }
         }
+      } else if (type == ReduceType::SUM) {
+        for (index_t i = start0; i < end0; i += step0) {
+          for (index_t j = start1; j < end1; j += step1) {
+            uint32_t tmp = 0;
+            for (int k = 0; k < data_reshape_[1]; ++k) {
+              tmp += input[(i * data_reshape_[1] + k) * data_reshape_[2] + j];
+            }
+            output[i * data_reshape_[2] + j] =
+                static_cast<uint8_t>(std::min<uint32_t>(tmp, 255));
+          }
+        }
       } else {
         MACE_NOT_IMPLEMENTED;
       }
@@ -804,6 +914,21 @@ void ReduceOp<DeviceType::CPU, uint8_t>::Reduce4Dims(
             output[i * data_reshape_[3] + j] = tmp;
           }
         }
+      } else if (type == ReduceType::SUM) {
+        for (index_t i = start0; i < end0; i += step0) {
+          for (index_t j = start1; j < end1; j += step1) {
+            uint32_t tmp = 0;
+            for (int k = 0; k < data_reshape_[2]; ++k) {
+              for (int t = 0; t < data_reshape_[0]; ++t) {
+                tmp += input[((t * data_reshape_[1] + i) *
+                    data_reshape_[2] + k) * data_reshape_[3] + j];
+              }
+            }
+            // SUM has no divisor, so no rounding term; saturate to uint8_t.
+            output[i * data_reshape_[3] + j] =
+                static_cast<uint8_t>(std::min<uint32_t>(tmp, 255));
+          }
+        }
       } else {
         MACE_NOT_IMPLEMENTED;
       }
@@ -858,6 +983,21 @@ void ReduceOp<DeviceType::CPU, uint8_t>::Reduce4Dims(
             output[i * data_reshape_[2] + j] = tmp;
           }
         }
+      } else if (type == ReduceType::SUM) {
+        for (index_t i = start0; i < end0; i += step0) {
+          for (index_t j = start1; j < end1; j += step1) {
+            uint32_t tmp = 0;
+            for (int k = 0; k < data_reshape_[1]; ++k) {
+              for (int t = 0; t < data_reshape_[3]; ++t) {
+                tmp += input[((i * data_reshape_[1] + k) *
+                    data_reshape_[2] + j) * data_reshape_[3] + t];
+              }
+            }
+            // SUM has no divisor, so no rounding term; saturate to uint8_t.
+            output[i * data_reshape_[2] + j] =
+                static_cast<uint8_t>(std::min<uint32_t>(tmp, 255));
+          }
+        }
       } else {
         MACE_NOT_IMPLEMENTED;
       }
diff --git a/mace/python/tools/converter_tool/base_converter.py b/mace/python/tools/converter_tool/base_converter.py
index 4e49c96601a2828d3b17b160aa4a4f169746ac2c..dc617f12e3ff5b183cd7f78c9379306b82bff1f8 100644
--- a/mace/python/tools/converter_tool/base_converter.py
+++ b/mace/python/tools/converter_tool/base_converter.py
@@ -88,6 +88,7 @@ class ReduceType(Enum):
     MIN = 1
     MAX = 2
     PROD = 3
+    SUM = 4
 
 
 class PadType(Enum):
diff --git a/mace/python/tools/converter_tool/tensorflow_converter.py b/mace/python/tools/converter_tool/tensorflow_converter.py
index 584796798f25349b5be5b962d29d4acc281e72f9..a63101a1af8661eb16aa1b094b60aae52f2877f9 100644
--- a/mace/python/tools/converter_tool/tensorflow_converter.py
+++ b/mace/python/tools/converter_tool/tensorflow_converter.py
@@ -70,6 +70,7 @@ TFSupportedOps = [
     'Square',
     'SquaredDifference',
     'Rsqrt',
+    'Sum',
     'Equal',
     'Relu',
     'LeakyRelu',
@@ -188,6 +189,7 @@ class TensorflowConverter(base_converter.ConverterInterface):
         TFOpType.Max.name: ReduceType.MAX,
         TFOpType.Mean.name: ReduceType.MEAN,
         TFOpType.Prod.name: ReduceType.PROD,
+        TFOpType.Sum.name: ReduceType.SUM,
     }
 
     pad_type = {
@@ -268,6 +270,7 @@ class TensorflowConverter(base_converter.ConverterInterface):
             TFOpType.MirrorPad.name: self.convert_pad,
             TFOpType.Cumsum.name: self.convert_cumsum,
             TFOpType.OneHot.name: self.convert_one_hot,
+            TFOpType.Sum.name: self.convert_reduce,
         }
         self._option = option
         self._mace_net_def = mace_pb2.NetDef()
@@ -909,7 +912,10 @@ class TensorflowConverter(base_converter.ConverterInterface):
                     reduce_dims = tf_op.get_attr('reduction_indices')
                 except ValueError:
                     reduce_dims = []
-        axis_arg.ints.extend(reduce_dims)
+        if isinstance(reduce_dims, list):
+            axis_arg.ints.extend(reduce_dims)
+        else:
+            axis_arg.ints.append(reduce_dims)
         keep_dims_arg = op.arg.add()
         keep_dims_arg.name = MaceKeyword.mace_keepdims_str
         try:
diff --git a/mace/python/tools/converter_tool/transformer.py b/mace/python/tools/converter_tool/transformer.py
index 6a07db0588ffd04bbd9480fd2a148c1fed423b93..fe5b08d6599b849beb2da0d05a7aee9785006166 100644
--- a/mace/python/tools/converter_tool/transformer.py
+++ b/mace/python/tools/converter_tool/transformer.py
@@ -1205,12 +1205,13 @@ class Transformer(base_converter.ConverterInterface):
                     if op.output[0] in self._consumers:
                         consumer = self._consumers[op.output[0]][0]
                         # if there is a shape op, remove it too
-                        if (consumer.input[1] in self._producer
-                            and self._producer[consumer.input[1]].type
-                                == 'Shape'):
-                            self.safe_remove_node(
-                                self._producer[consumer.input[1]], None,
-                                remove_input_tensor=True)
+                        if len(consumer.input) > 1:
+                            if (consumer.input[1] in self._producer
+                                and self._producer[consumer.input[1]].type
+                                    == 'Shape'):
+                                self.safe_remove_node(
+                                    self._producer[consumer.input[1]], None,
+                                    remove_input_tensor=True)
                         # remove consumer reshape
                         self.safe_remove_node(consumer, op,
                                               remove_input_tensor=True)