Commit e2c59ddf authored by E eclipsess

Add sum op and elementwise_mul op

Parent 4e9454a5
......@@ -62,6 +62,8 @@ const char *G_OP_TYPE_CRF = "crf_decoding";
const char *G_OP_TYPE_BILINEAR_INTERP = "bilinear_interp";
const char *G_OP_TYPE_FLATTEN = "flatten";
const char *G_OP_TYPE_SHAPE = "shape";
const char *G_OP_TYPE_ELEMENTWISE_MUL = "elementwise_mul";
const char *G_OP_TYPE_SUM = "sum";
const char *G_OP_TYPE_QUANTIZE = "quantize";
const char *G_OP_TYPE_DEQUANTIZE = "dequantize";
......@@ -115,7 +117,8 @@ std::unordered_map<
{G_OP_TYPE_FLATTEN, {{"X"}, {"Out"}}},
{G_OP_TYPE_SHAPE, {{"Input"}, {"Out"}}},
{G_OP_TYPE_CONV_TRANSPOSE, {{"Input"}, {"Output"}}},
{G_OP_TYPE_SUM, {{"X"}, {"Out"}}},
{G_OP_TYPE_ELEMENTWISE_MUL, {{"X", "Y"}, {"Out"}}},
{G_OP_TYPE_QUANTIZE, {{"X"}, {"Out", "OutScale"}}},
{G_OP_TYPE_DEQUANTIZE, {{"X", "Scale"}, {"Out"}}}};
} // namespace paddle_mobile
......@@ -126,6 +126,8 @@ extern const char *G_OP_TYPE_REGION;
extern const char *G_OP_TYPE_FUSION_CONV_BN;
extern const char *G_OP_TYPE_CONV_TRANSPOSE;
extern const char *G_OP_TYPE_PRELU;
extern const char *G_OP_TYPE_SUM;
extern const char *G_OP_TYPE_ELEMENTWISE_MUL;
extern const char *G_OP_TYPE_QUANTIZE;
extern const char *G_OP_TYPE_DEQUANTIZE;
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <initializer_list>
#include <vector>
#include "framework/tensor.h"
#include "framework/tensor_util.h"
namespace paddle_mobile {
namespace framework {
// Vector<T> implements the std::vector interface, and can get Data or
// MutableData from any place. The data will be synced implicitly inside.
template <typename T>
class Vector {
public:
using value_type = T;
// Default ctor. Create empty Vector
Vector() { InitEmpty(); }
// Fill vector with value. The vector size is `count`.
explicit Vector(size_t count, const T& value = T()) {
InitEmpty();
if (count != 0) {
resize(count);
T* ptr = begin();
for (size_t i = 0; i < count; ++i) {
ptr[i] = value;
}
}
}
// Ctor with init_list
Vector(std::initializer_list<T> init) {
if (init.size() == 0) {
InitEmpty();
} else {
InitByIter(init.size(), init.begin(), init.end());
}
}
// implicit cast from std::vector.
template <typename U>
Vector(const std::vector<U>& dat) { // NOLINT
if (dat.size() == 0) {
InitEmpty();
} else {
InitByIter(dat.size(), dat.begin(), dat.end());
}
}
// Copy ctor
Vector(const Vector<T>& other) { this->operator=(other); }
// Copy operator
Vector<T>& operator=(const Vector<T>& other) {
if (other.size() != 0) {
this->InitByIter(other.size(), other.begin(), other.end());
} else {
InitEmpty();
}
return *this;
}
// Move ctor
Vector(Vector<T>&& other) {
this->size_ = other.size_;
this->flag_ = other.flag_;
if (other.cuda_vec_.memory_size()) {
this->cuda_vec_.ShareDataWith(other.cuda_vec_);
}
if (other.cpu_vec_.memory_size()) {
this->cpu_vec_.ShareDataWith(other.cpu_vec_);
}
}
// CPU data access method. Mutable.
T& operator[](size_t i) {
MutableCPU();
return const_cast<T*>(cpu_vec_.data<T>())[i];
}
// CPU data access method. Immutable.
const T& operator[](size_t i) const {
// ImmutableCPU();
return cpu_vec_.data<T>()[i];
}
// std::vector iterator methods. Based on CPU data access method
size_t size() const { return size_; }
T* begin() { return capacity() == 0 ? &EmptyDummy() : &this->operator[](0); }
T* end() {
return capacity() == 0 ? &EmptyDummy() : &this->operator[](size());
}
T& front() { return *begin(); }
T& back() {
auto it = end();
--it;
return *it;
}
const T* begin() const {
return capacity() == 0 ? &EmptyDummy() : &this->operator[](0);
}
const T* end() const {
return capacity() == 0 ? &EmptyDummy() : &this->operator[](size());
}
const T* cbegin() const { return begin(); }
const T* cend() const { return end(); }
const T& back() const {
auto it = end();
--it;
return *it;
}
T* data() { return begin(); }
const T* data() const { return begin(); }
const T& front() const { return *begin(); }
// end of std::vector iterator methods
// assign this from iterator.
// NOTE: the iterator must support `end-begin`
template <typename Iter>
void assign(Iter begin, Iter end) {
InitByIter(end - begin, begin, end);
}
// push_back. If the previous capacity is not enough, the memory will
// double.
void push_back(T elem) {
if (size_ + 1 > capacity()) {
reserve((size_ + 1) << 1);
}
*end() = elem;
++size_;
}
// extend a vector by iterator.
// NOTE: the iterator must support end-begin
template <typename It>
void Extend(It begin, It end) {
size_t pre_size = size_;
resize(pre_size + (end - begin));
T* ptr = this->begin() + pre_size;
for (; begin < end; ++begin, ++ptr) {
*ptr = *begin;
}
}
// resize the vector
void resize(size_t size) {
if (size + 1 <= capacity()) {
size_ = size;
} else {
MutableCPU();
Tensor cpu_tensor;
T* ptr = cpu_tensor.mutable_data<T>(
framework::make_ddim({static_cast<int64_t>(size)}));
const T* old_ptr =
cpu_vec_.memory_size() == 0 ? nullptr : cpu_vec_.data<T>();
if (old_ptr != nullptr) {
std::copy(old_ptr, old_ptr + size_, ptr);
}
size_ = size;
cpu_vec_.ShareDataWith(cpu_tensor);
}
}
// clear
void clear() {
size_ = 0;
flag_ = kDirty | kDataInCPU;
}
size_t capacity() const {
return cpu_vec_.memory_size() / SizeOfType(typeid(T));
}
// reserve data
void reserve(size_t size) {
size_t pre_size = size_;
resize(size);
resize(pre_size);
}
// implicit cast operator. Vector can be cast to std::vector implicitly.
operator std::vector<T>() const {
std::vector<T> result;
result.resize(size());
std::copy(begin(), end(), result.begin());
return result;
}
bool operator==(const Vector<T>& other) const {
if (size() != other.size()) return false;
auto it1 = cbegin();
auto it2 = other.cbegin();
for (; it1 < cend(); ++it1, ++it2) {
if (*it1 != *it2) {
return false;
}
}
return true;
}
private:
void InitEmpty() {
size_ = 0;
flag_ = kDataInCPU;
}
template <typename Iter>
void InitByIter(size_t size, Iter begin, Iter end) {
T* ptr = this->cpu_vec_.template mutable_data<T>(
framework::make_ddim({static_cast<int64_t>(size)}));
for (size_t i = 0; i < size; ++i) {
*ptr++ = *begin++;
}
flag_ = kDataInCPU | kDirty;
size_ = size;
}
enum DataFlag {
kDataInCPU = 0x01,
kDataInCUDA = 0x02,
// kDirty means the data has been changed in one device.
kDirty = 0x10
};
void MutableCPU() { flag_ = kDirty | kDataInCPU; }
void UnsetFlag(int flag) const { flag_ &= ~flag; }
void SetFlag(int flag) const { flag_ |= flag; }
static T& EmptyDummy() {
static T dummy = T();
return dummy;
}
mutable int flag_;
mutable Tensor cpu_vec_;
mutable Tensor cuda_vec_;
size_t size_;
};
} // namespace framework
} // namespace paddle_mobile
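For orientation, here is a hedged usage sketch of the Vector<T> class above, exercising only the CPU path. The include path framework/mixed_vector.h is an assumption inferred from the #include "mixed_vector.h" in selected_rows.h further down.

// Usage sketch (assumed header path; CPU path only).
#include <cstdint>
#include <vector>
#include "framework/mixed_vector.h"

void MixedVectorDemo() {
  paddle_mobile::framework::Vector<int64_t> rows = {0, 4, 7};  // init-list ctor
  rows.push_back(9);                  // doubles capacity when it runs out
  rows[0] = 1;                        // mutable access marks the CPU copy dirty
  std::vector<int64_t> plain = rows;  // implicit cast copies the data out
  (void)plain;
}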
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "framework/selected_rows.h"
namespace paddle_mobile {
namespace framework {
struct ReAllocateVisitor {
ReAllocateVisitor(framework::Tensor* tensor, const framework::DDim& dims)
: tensor_(tensor), dims_(dims) {}
template <typename T>
void operator()() const {
framework::Tensor cpu_tensor;
T* ptr = cpu_tensor.mutable_data<T>(dims_);
const T* old_ptr =
tensor_->memory_size() == 0 ? nullptr : tensor_->data<T>();
if (old_ptr != nullptr) {
std::copy(old_ptr, old_ptr + tensor_->numel(), ptr);
}
tensor_->ShareDataWith(cpu_tensor);
}
framework::Tensor* tensor_;
framework::DDim dims_;
};
// TensorCopyVisitor(value, i * value_width, *value_.get(),
// index * value_width, value_width));
struct TensorCopyVisitor {
TensorCopyVisitor(framework::Tensor* dst, int64_t dst_offset,
const framework::Tensor src, int64_t src_offset,
int64_t size)
: dst_(dst),
dst_offset_(dst_offset),
src_(src),
src_offset_(src_offset),
size_(size) {}
template <typename T>
void operator()() const {
// TODO(Yancey1989): support other place
memory::Copy(dst_->mutable_data<T>() + dst_offset_,
src_.data<T>() + src_offset_, size_ * sizeof(T));
}
framework::Tensor* dst_;
int64_t dst_offset_;
framework::Tensor src_;
int64_t src_offset_;
int64_t size_;
};
bool SelectedRows::HasKey(int64_t key) const {
  return std::find(rows_.begin(), rows_.end(), key) != rows_.end();
}
// std::vector<int64_t> SelectedRows::Get(std::vector<int64_t> keys,
// framework::Tensor* value) const {
// PADDLE_MOBILE_ENFORCE(value->IsInitialized(),
// "The value tensor should be initialized.");
// std::vector<int64_t> non_keys;
// int64_t value_width = value_->numel() / value_->dims()[0];
// PADDLE_MOBILE_ENFORCE(value_width == value->numel() / value->dims()[0],
// "output tensor should have the same shape with table "
// "execpt the dims[0].");
//
// for (size_t i = 0; i < keys.size(); ++i) {
// int64_t index = Index(keys[i]);
// if (index == -1) {
// non_keys.push_back(keys[i]);
// } else {
// framework::VisitDataType(
// framework::ToDataType(value_->type()),
// TensorCopyVisitor(value, i * value_width, *value_.get(),
// index * value_width, value_width));
// }
// }
// return non_keys;
//}
// bool SelectedRows::Set(int64_t key, const framework::Tensor& value) {
// PADDLE_MOBILE_ENFORCE(value.IsInitialized(), "The value should be
// initialized."); if (value_->IsInitialized()) {
// PADDLE_MOBILE_ENFORCE(
// value.type() == value_->type(),
// "The type of the value should be same with the original value");
// }
// PADDLE_MOBILE_ENFORCE(value.dims()[0] == static_cast<size_t>(1),
// "The first dim of value should be 1.");
// auto index = Index(key);
// bool is_new_key = false;
// if (index == -1) {
// rows_.push_back(key);
// index = rows_.size() - 1;
// is_new_key = true;
// // whether need to resize the table
// if (static_cast<int64_t>(rows_.size()) > value_->dims()[0]) {
// auto dims = value_->dims();
// dims[0] = (dims[0] + 1) << 1;
// framework::VisitDataType(framework::ToDataType(value.type()),
// ReAllocateVisitor(value_.get(), dims));
// }
// }
//
// framework::VisitDataType(
// framework::ToDataType(value.type()),
// TensorCopyVisitor(value_.get(),
// index * value_->numel() / value_->dims()[0], value,
// static_cast<int64_t>(0), value.numel()));
// return is_new_key;
//}
} // namespace framework
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <vector>
#include "framework/lod_tensor.h"
#include "framework/tensor.h"
#include "memory/t_malloc.h"
#include "mixed_vector.h"
namespace paddle_mobile {
namespace framework {
class SelectedRows {
/*
 * @brief We can use the SelectedRows structure to represent a sparse table.
 * A sparse table is a key-value structure whose key is an `int64_t` number
 * and whose value is a Tensor indexed along its first dimension.
 * You can use the following interface to operate on the sparse table;
 * more detail can be found in the comments of each interface:
 *
 * HasKey(key): whether the sparse table has the specified key.
 * Set(key, value): set a key-value pair into the sparse table.
 * Get(keys, value*): get values by the given key list and write them to the
 * given value pointer at the specified offsets.
 *
 */
public:
SelectedRows(const std::vector<int64_t>& rows, const int64_t& height)
: rows_(rows), height_(height) {
value_.reset(new Tensor());
}
SelectedRows() {
height_ = 0;
value_.reset(new Tensor());
}
// platform::Place place() const { return value_->place(); }
const Tensor& value() const { return *value_; }
Tensor* mutable_value() { return value_.get(); }
int64_t height() const { return height_; }
void set_height(int64_t height) { height_ = height; }
const Vector<int64_t>& rows() const { return rows_; }
Vector<int64_t>* mutable_rows() { return &rows_; }
void set_rows(const Vector<int64_t>& rows) { rows_ = rows; }
/*
 * @brief whether the table has the specified key.
 *
 * @return true if the key exists.
 */
bool HasKey(int64_t key) const;
/*
 * @brief Get values by the given key list.
 *
 * @return a list of keys which do not exist in the table
 */
std::vector<int64_t> Get(std::vector<int64_t> keys,
framework::Tensor* tensor) const;
/*
* @brief Set a key-value pair into the table.
* This function will double the value memory if it's not enough.
*
* @note:
* 1. The first dim of the value should be 1
* 2. The value should be initialized and the data type
* should be the same with the table.
*
* @return true if the key is a new one, otherwise false
*
*/
bool Set(int64_t key, const Tensor& value);
/*
* @brief Get the index of key in rows
*
* @return -1 if the key does not exist.
*/
int64_t Index(int64_t key) const {
auto it = std::find(rows_.begin(), rows_.end(), key);
if (it == rows_.end()) {
return static_cast<int64_t>(-1);
}
return static_cast<int64_t>(std::distance(rows_.begin(), it));
}
DDim GetCompleteDims() const {
std::vector<int64_t> dims = vectorize(value_->dims());
dims[0] = height_;
return make_ddim(dims);
}
private:
// Notice: rows can contain duplicates, e.g. {0, 4, 7, 0, 5, 7, 9} here.
// SelectedRows are simply concatenated when added together; duplicate rows
// are only resolved when a SelectedRows is added to a Tensor.
Vector<int64_t> rows_;
std::unique_ptr<Tensor> value_{nullptr};
int64_t height_;
};
/*
 * Serialize/Deserialize SelectedRows to std::ostream
 * You can pass ofstream or ostringstream to serialize to a file
 * or to an in-memory string. GPU tensor will be copied to CPU.
 */
void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows);
void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows);
} // namespace framework
} // namespace paddle_mobile
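To make the lookup interface concrete, a minimal sketch using only the members declared above (the rows/height constructor, HasKey, and Index); this is an illustration, not code from the commit.

#include <cstdint>
#include "framework/selected_rows.h"

void SelectedRowsDemo() {
  using paddle_mobile::framework::SelectedRows;
  SelectedRows table({0, 4, 7}, /*height=*/10);  // rows 0, 4, 7 of a 10-row table
  bool has = table.HasKey(4);     // true: key 4 is in rows_
  int64_t pos = table.Index(7);   // 2: offset of key 7 inside rows_
  int64_t miss = table.Index(5);  // -1: key 5 is absent
  (void)has;
  (void)pos;
  (void)miss;
}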
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef ELEMENTWISEMUL_OP
#include "elementwise_mul_op.h"
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void ElementwiseMulOp<Dtype, T>::InferShape() const {
auto x_dim = this->param_.InputX()->dims();
this->param_.Out()->Resize(x_dim);
}
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(elementwise_mul, ops::ElementwiseMulOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
REGISTER_OPERATOR_MALI_GPU(elementwise_mul, ops::ElementwiseMulOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef ELEMENTWISEMUL_OP
#pragma once
#include <string>
#include "framework/operator.h"
#include "kernel/elementwise_mul_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
using std::string;
template <typename DeviceType, typename T>
class ElementwiseMulOp : public framework::OperatorWithKernel<
DeviceType, ElementwiseMulParam<DeviceType>,
operators::ElementwiseMulKernel<DeviceType, T>> {
public:
ElementwiseMulOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<
DeviceType, ElementwiseMulParam<DeviceType>,
operators::ElementwiseMulKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
using framework::OperatorWithKernel<
DeviceType, ElementwiseMulParam<DeviceType>,
operators::ElementwiseMulKernel<DeviceType, T>>::OperatorWithKernel;
void InferShape() const override;
protected:
};
} // namespace operators
} // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(elementwise_mul);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(elementwise_mul);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef ELEMENTWISEMUL_OP
#include "operators/kernel/elementwise_mul_kernel.h"
#include "operators/kernel/central-arm-func/elementwise_mul_arm_func.h"
namespace paddle_mobile {
namespace operators {
template <>
bool ElementwiseMulKernel<CPU, float>::Init(ElementwiseMulParam<CPU> *param) {
return true;
}
template <>
void ElementwiseMulKernel<CPU, float>::Compute(
const ElementwiseMulParam<CPU> &param) const {
ElementwiseMulCompute<float>(param);
param.Out()->set_lod(param.InputX()->lod());
}
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef SUM_OP
#include "operators/kernel/sum_kernel.h"
#include "operators/kernel/central-arm-func/sum_arm_func.h"
namespace paddle_mobile {
namespace operators {
template <>
bool SumKernel<CPU, float>::Init(SumParam<CPU> *param) {
return true;
}
template <>
void SumKernel<CPU, float>::Compute(const SumParam<CPU> &param) const {
SumCompute<float>(param);
param.Out()->set_lod(param.Inputs()[0]->lod());
}
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef ELEMENTWISEMUL_OP
#pragma once
#include "operators/math/elementwise_op_function.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename T>
struct MulFunctor {
inline T operator()(T a, T b) const { return a * b; }
};
template <typename P>
void ElementwiseMulCompute(const ElementwiseMulParam<CPU> &param) {
const Tensor *input_x = param.InputX();
const Tensor *input_y = param.InputY();
Tensor *Out = param.Out();
Out->mutable_data<float>();
int axis = param.Axis();
ElementwiseComputeEx<MulFunctor<float>, float>(input_x, input_y, axis,
MulFunctor<float>(), Out);
}
template class ElementwiseMulKernel<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
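The axis attribute follows the usual elementwise broadcast convention: Y's shape must match a contiguous slice of X's shape starting at axis. A self-contained sketch of that semantics, written against plain arrays rather than ElementwiseComputeEx:

// X has shape (2, 3), Y has shape (3,), axis = 1: every row of X is
// multiplied elementwise by Y.
#include <cstdio>

int main() {
  const float x[2][3] = {{1, 2, 3}, {4, 5, 6}};
  const float y[3] = {10, 20, 30};
  float out[2][3];
  for (int i = 0; i < 2; ++i) {
    for (int j = 0; j < 3; ++j) {
      out[i][j] = x[i][j] * y[j];  // Y is broadcast along dimension 0
    }
  }
  std::printf("%g\n", out[1][2]);  // prints 180
  return 0;
}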
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef SUM_OP
#pragma once
#include <vector>
#include "operators/math/selected_rows_functor.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
using LoDTensorArray = std::vector<LoDTensor>;
template <typename P>
void SumCompute(const SumParam<CPU> &param) {
auto inputsvars = param.InputsVars();
int N = inputsvars.size();
auto *outvar = param.OutVar();
bool in_place = outvar == inputsvars[0];
DLOG << "11:";
if (outvar->IsType<framework::LoDTensor>()) {
auto *out = outvar->GetMutable<LoDTensor>();
if (!in_place) {
out->mutable_data<float>();
}
DLOG << "1:";
auto *outptr = out->data<float>();
// auto result = Flatten(*out);
if (!in_place) {
std::fill(out->data<float>(), out->data<float>() + out->numel(), 0);
}
math::SelectedRowsAddToTensor<float> functor;
for (int i = in_place ? 1 : 0; i < N; i++) {
if (inputsvars[i]->IsType<framework::LoDTensor>()) {
auto *in_t = inputsvars[i]->Get<framework::LoDTensor>();
auto *inptr = in_t->data<float>();
if (in_t->numel() == 0) {
continue;
}
for (int j = 0; j < out->numel(); ++j) {
outptr[j] = outptr[j] + inptr[j];
}
} else if (inputsvars[i]->IsType<framework::SelectedRows>()) {
auto *in_t = inputsvars[i]->Get<framework::SelectedRows>();
functor(*in_t, out);
} else {
PADDLE_MOBILE_THROW_EXCEPTION(
"Variable type must be LoDTensor/SelectedRows.");
}
}
} else if (outvar->IsType<framework::SelectedRows>()) {
DLOG << "2:";
std::unique_ptr<framework::SelectedRows> in0;
if (in_place) {
// If is in_place, we store the input[0] to in0
auto *in_sel0 = inputsvars[0]->Get<SelectedRows>();
auto &rows = in_sel0->rows();
//#ifdef PADDLE_WITH_CUDA
// std::vector<int64_t> rows_in_cpu;
// rows_in_cpu.reserve(rows.size());
// for (auto item : rows) {
// rows_in_cpu.push_back(item);
// }
// in0.reset(new framework::SelectedRows(rows_in_cpu,
// in_sel0.height()));
//#else
in0.reset(new framework::SelectedRows(rows, in_sel0->height()));
//#endif
in0->mutable_value()->ShareDataWith(in_sel0->value());
}
auto get_selected_row = [&](size_t i) -> const SelectedRows & {
if (i == 0 && in0) {
return *in0.get();
} else {
return *(inputsvars[i]->Get<SelectedRows>());
}
};
auto *out = outvar->GetMutable<SelectedRows>();
out->mutable_rows()->clear();
auto *out_value = out->mutable_value();
// Runtime InferShape
size_t first_dim = 0;
for (int i = 0; i < N; i++) {
auto &sel_row = get_selected_row(i);
first_dim += sel_row.rows().size();
}
auto in_dim = framework::vectorize(get_selected_row(N - 1).value().dims());
in_dim[0] = static_cast<int64_t>(first_dim);
out_value->Resize(framework::make_ddim(in_dim));
// if all the input sparse vars are empty, no need to
// merge these vars.
if (first_dim == 0UL) {
return;
}
out_value->mutable_data<float>();
math::SelectedRowsAddTo<float> functor;
int64_t offset = 0;
for (int i = 0; i < N; i++) {
auto &sel_row = get_selected_row(i);
if (sel_row.rows().size() == 0) {
continue;
}
      PADDLE_MOBILE_ENFORCE(out->height() == sel_row.height(),
                            "input and output height must match");
functor(sel_row, offset, out);
offset += sel_row.value().numel();
}
} else if (outvar->IsType<LoDTensorArray>()) {
DLOG << "3:";
auto &out_array = *outvar->GetMutable<LoDTensorArray>();
for (size_t i = in_place ? 1 : 0; i < inputsvars.size(); ++i) {
PADDLE_MOBILE_ENFORCE(inputsvars[i]->IsType<LoDTensorArray>(),
"Only support all inputs are TensorArray");
auto *in_array = inputsvars[i]->Get<LoDTensorArray>();
      for (size_t k = 0; k < in_array->size(); ++k) {
        if ((*in_array)[k].numel() != 0) {
          if (k >= out_array.size()) {
            out_array.resize(k + 1);
          }
          if (out_array[k].numel() == 0) {
            framework::TensorCopy((*in_array)[k], &out_array[k]);
            out_array[k].set_lod((*in_array)[k].lod());
          } else {
            PADDLE_MOBILE_ENFORCE(out_array[k].lod() == (*in_array)[k].lod(),
                                  "LoD of accumulated tensor arrays must match");
            auto *inptr = (*in_array)[k].data<float>();
            auto *outptr = out_array[k].data<float>();
            for (int j = 0; j < (*in_array)[k].numel(); ++j) {
              outptr[j] = inptr[j] + outptr[j];
            }
          }
        }
      }
}
} else {
DLOG << "2:";
if (outvar->IsType<framework::Tensor>()) {
DLOG << "3: ";
}
PADDLE_MOBILE_THROW_EXCEPTION(
"Unexpected branch, output variable type is %s", outvar->Type().name());
}
}
} // namespace operators
} // namespace paddle_mobile
#endif
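For the plain LoDTensor branch above, the computation reduces to elementwise accumulation over the inputs. A stand-alone restatement under that assumption:

// out[j] = sum over all inputs of in[j], with out zero-initialized first,
// mirroring the outptr[j] = outptr[j] + inptr[j] loop above.
#include <cstddef>
#include <vector>

std::vector<float> SumDense(const std::vector<std::vector<float>>& ins) {
  std::vector<float> out(ins.empty() ? 0 : ins[0].size(), 0.0f);
  for (const auto& in : ins) {
    for (std::size_t j = 0; j < out.size(); ++j) {
      out[j] += in[j];
    }
  }
  return out;
}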
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef ELEMENTWISEMUL_OP
#pragma once
#include "framework/operator.h"
#include "operators/math/elementwise_op_function.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
using namespace framework;
template <typename DeviceType, typename T>
class ElementwiseMulKernel
: public framework::OpKernelBase<DeviceType,
ElementwiseMulParam<DeviceType>> {
public:
void Compute(const ElementwiseMulParam<DeviceType> &param) const;
bool Init(ElementwiseMulParam<DeviceType> *param);
};
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef SUM_OP
#pragma once
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
using namespace framework;
template <typename DeviceType, typename T>
class SumKernel
: public framework::OpKernelBase<DeviceType, SumParam<DeviceType>> {
public:
void Compute(const SumParam<DeviceType> &param) const;
bool Init(SumParam<DeviceType> *param);
};
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <set>
#include "operators/math/math_function.h"
#include "operators/math/selected_rows_functor.h"
namespace paddle_mobile {
namespace operators {
namespace math {
// template <typename T>
// struct SelectedRowsAdd<T> {
// void operator()(
// const framework::SelectedRows& input1,
// const framework::SelectedRows& input2,
// framework::SelectedRows* output) {
// auto in1_height = input1.height();
// PADDLE_MOBILE_ENFORCE(in1_height == input2.height());
// output->set_height(in1_height);
//
// auto& in1_rows = input1.rows();
// auto& in2_rows = input2.rows();
// std::vector<int64_t> out_rows;
// out_rows.reserve(in1_rows.size() + in2_rows.size());
//
// // concat rows
// out_rows.insert(out_rows.end(), in1_rows.begin(), in1_rows.end());
// out_rows.insert(out_rows.end(), in2_rows.begin(), in2_rows.end());
// output->set_rows(out_rows);
//
// auto* out_value = output->mutable_value();
// auto& in1_value = input1.value();
// auto& in2_value = input2.value();
//
// auto in1_row_numel = in1_value.numel() / in1_rows.size();
// PADDLE_MOBILE_ENFORCE(in1_row_numel == in2_value.numel() /
// in2_rows.size());
// PADDLE_MOBILE_ENFORCE(in1_row_numel == out_value->numel() /
// out_rows.size());
//
//// auto in1_place = input1.place();
//// PADDLE_MOBILE_ENFORCE(platform::is_cpu_place(in1_place));
//// auto in2_place = input2.place();
//// PADDLE_MOBILE_ENFORCE(platform::is_cpu_place(in2_place));
//// auto out_place = context.GetPlace();
//// PADDLE_MOBILE_ENFORCE(platform::is_cpu_place(out_place));
//
// auto* out_data = out_value->data<T>();
// auto* in1_data = in1_value.data<T>();
// memory::Copy(out_data, in1_data,
// in1_value.numel() * sizeof(T));
//
// auto* in2_data = in2_value.data<T>();
// memory::Copy(
// out_data + in1_value.numel(),
// in2_data,
// in2_value.numel() * sizeof(T));
// }
//};
//
// template struct SelectedRowsAdd<float>;
// template struct SelectedRowsAdd<double>;
////
////template <typename T>
////struct SelectedRowsAddTensor<T> {
//// void operator()(
//// const framework::SelectedRows& input1,
//// const framework::Tensor& input2, framework::Tensor*
/// output) { / auto in1_height = input1.height(); / auto in2_dims =
/// input2.dims(); / auto out_dims = output->dims(); /
/// PADDLE_MOBILE_ENFORCE(in1_height == in2_dims[0]); /
/// PADDLE_MOBILE_ENFORCE(in1_height == out_dims[0]);
////
//// auto& in1_value = input1.value();
//// auto& in1_rows = input1.rows();
////
//// int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
//// PADDLE_MOBILE_ENFORCE(in1_row_numel == input2.numel() / in1_height);
//// PADDLE_MOBILE_ENFORCE(in1_row_numel == output->numel() / in1_height);
////
//// SetConstant<T> functor;
//// functor(output, 0.0);
////
//// auto* in1_data = in1_value.data<T>();
//// auto* out_data = output->data<T>();
////
//// for (size_t i = 0; i < in1_rows.size(); i++) {
//// for (int64_t j = 0; j < in1_row_numel; j++) {
//// out_data[in1_rows[i] * in1_row_numel + j] +=
//// in1_data[i * in1_row_numel + j];
//// }
//// }
////
//// auto out_eigen = framework::EigenVector<T>::Flatten(*output);
//// auto in2_eigen = framework::EigenVector<T>::Flatten(input2);
//// out_eigen.device(*context.eigen_device()) = out_eigen + in2_eigen;
//// }
////};
////
////template struct SelectedRowsAddTensor< float>;
////template struct SelectedRowsAddTensor<double>;
//
// template <typename T>
// struct SelectedRowsAddTo {
// void operator()(
// const framework::SelectedRows& input1,
// const int64_t input2_offset,
// framework::SelectedRows* input2) {
// auto in1_height = input1.height();
// PADDLE_MOBILE_ENFORCE(in1_height == input2->height());
//
// auto& in1_rows = input1.rows();
// auto& in2_rows = *(input2->mutable_rows());
//
// auto& in1_value = input1.value();
// auto* in2_value = input2->mutable_value();
//
// // concat rows
// in2_rows.Extend(in1_rows.begin(), in1_rows.end());
//
//// auto in1_place = input1.place();
//// PADDLE_ENFORCE(platform::is_cpu_place(in1_place));
//// auto in2_place = input2->place();
//// PADDLE_ENFORCE(platform::is_cpu_place(in2_place));
//
// auto* in1_data = in1_value.data<T>();
// auto* in2_data = in2_value->data<T>();
// memory::Copy(
// in2_data + input2_offset,
// in1_data,
// in1_value.numel() * sizeof(T));
// }
//};
//
// template struct SelectedRowsAddTo<float>;
// template struct SelectedRowsAddTo<double>;
// template struct SelectedRowsAddTo<int>;
// template struct SelectedRowsAddTo<int64_t>;
//
// template <typename T>
// struct SelectedRowsAddToTensor<T> {
// void operator()(const framework::SelectedRows& input1,
// framework::Tensor* input2) {
// auto in1_height = input1.height();
// auto in2_dims = input2->dims();
// PADDLE_MOBILE_ENFORCE(in1_height == in2_dims[0]);
//
// auto& in1_value = input1.value();
// auto& in1_rows = input1.rows();
//
// int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
// PADDLE_MOBILE_ENFORCE(in1_row_numel == input2->numel() / in1_height);
//
// auto* in1_data = in1_value.data<T>();
// auto* input2_data = input2->data<T>();
//
// for (size_t i = 0; i < in1_rows.size(); i++) {
// for (int64_t j = 0; j < in1_row_numel; j++) {
// input2_data[in1_rows[i] * in1_row_numel + j] +=
// in1_data[i * in1_row_numel + j];
// }
// }
// }
//};
//
// template struct SelectedRowsAddToTensor< float>;
// template struct SelectedRowsAddToTensor<double>;
// template struct SelectedRowsAddToTensor< int>;
// template struct SelectedRowsAddToTensor< int64_t>;
//
//// This is a separated namespace for manipulate SelectedRows typed
//// data. Like merge duplicated rows, adding two SelectedRows etc.
////
//// Another group of functors is called "scatter updates", which means
//// use SelectedRows to update a dense tensor with different Ops, like
//// add or mul.
//
////namespace scatter {
////
////size_t FindPos(const std::vector<int64_t>& rows, int64_t value) {
//// return std::find(rows.begin(), rows.end(), value) - rows.begin();
////}
//
////template <typename T>
////struct MergeAdd<platform::CPUDeviceContext, T> {
//// framework::SelectedRows operator()(const platform::CPUDeviceContext&
/// context, / const
/// framework::SelectedRows& input) { / framework::SelectedRows out; / auto
/// input_rows = input.rows(); / std::set<int64_t>
/// row_set(input_rows.begin(), input_rows.end()); / std::vector<int64_t>
/// merge_rows(row_set.begin(), row_set.end());
////
//// auto input_width = input.value().dims()[1];
//// out.set_rows(merge_rows);
//// out.set_height(input.height());
//// out.mutable_value()->mutable_data<T>(
//// framework::make_ddim(
//// {static_cast<int64_t>(merge_rows.size()), input_width}),
//// context.GetPlace());
////
//// math::SetConstant<platform::CPUDeviceContext, T> constant_functor;
//// constant_functor(context, out.mutable_value(), 0.0);
////
//// auto* out_data = out.mutable_value()->data<T>();
//// auto* input_data = input.value().data<T>();
////
//// for (size_t i = 0; i < input_rows.size(); i++) {
//// size_t out_i = FindPos(merge_rows, input_rows[i]);
//// for (int64_t j = 0; j < input_width; j++) {
//// out_data[out_i * input_width + j] += input_data[i * input_width +
/// j]; / } / } / return out; / }
////};
////
////template struct MergeAdd<platform::CPUDeviceContext, float>;
////template struct MergeAdd<platform::CPUDeviceContext, double>;
////template struct MergeAdd<platform::CPUDeviceContext, int>;
////template struct MergeAdd<platform::CPUDeviceContext, int64_t>;
////
////template <typename T>
////struct UpdateToTensor<platform::CPUDeviceContext, T> {
//// void operator()(const platform::CPUDeviceContext& context,
//// const ScatterOps& op, const framework::SelectedRows&
/// input1, / framework::Tensor* input2) { / auto in1_height
///= input1.height(); / auto in2_dims = input2->dims(); /
/// PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
////
//// auto& in1_value = input1.value();
//// auto& in1_rows = input1.rows();
////
//// int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
//// PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height);
////
//// auto* in1_data = in1_value.data<T>();
//// auto* input2_data = input2->data<T>();
////
//// // FIXME(typhoonzero): use macro fix the below messy code.
//// switch (op) {
//// case ScatterOps::ASSIGN:
//// INLINE_FOR2(in1_rows.size(), in1_row_numel)
//// input2_data[in1_rows[i] * in1_row_numel + j] =
//// in1_data[i * in1_row_numel + j];
//// break;
//// case ScatterOps::ADD:
//// INLINE_FOR2(in1_rows.size(), in1_row_numel)
//// input2_data[in1_rows[i] * in1_row_numel + j] +=
//// in1_data[i * in1_row_numel + j];
//// break;
//// case ScatterOps::SUB:
//// INLINE_FOR2(in1_rows.size(), in1_row_numel)
//// input2_data[in1_rows[i] * in1_row_numel + j] -=
//// in1_data[i * in1_row_numel + j];
//// break;
//// case ScatterOps::SUBBY:
//// INLINE_FOR2(in1_rows.size(), in1_row_numel)
//// input2_data[in1_rows[i] * in1_row_numel + j] =
//// in1_data[i * in1_row_numel + j] -
//// input2_data[in1_rows[i] * in1_row_numel + j];
//// break;
//// case ScatterOps::MUL:
//// INLINE_FOR2(in1_rows.size(), in1_row_numel)
//// input2_data[in1_rows[i] * in1_row_numel + j] *=
//// in1_data[i * in1_row_numel + j];
//// break;
//// case ScatterOps::DIV:
//// INLINE_FOR2(in1_rows.size(), in1_row_numel)
//// input2_data[in1_rows[i] * in1_row_numel + j] /=
//// in1_data[i * in1_row_numel + j];
//// break;
//// case ScatterOps::DIVBY:
//// INLINE_FOR2(in1_rows.size(), in1_row_numel)
//// input2_data[in1_rows[i] * in1_row_numel + j] =
//// in1_data[i * in1_row_numel + j] /
//// input2_data[in1_rows[i] * in1_row_numel + j];
//// break;
//// }
//// }
////};
//
// // namespace scatter
} // namespace math
} // namespace operators
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "framework/selected_rows.h"
#define INLINE_FOR2(sizei, sizej) \
for (int64_t i = 0; i < sizei; i++) \
for (int64_t j = 0; j < sizej; j++)
namespace paddle_mobile {
namespace operators {
namespace math {
// SelectedRows + SelectedRows will simply concatenate values and rows.
// The real computation happens in dealing with LoDTensor.
// template <typename T>
// struct SelectedRowsAdd {
// void operator()(
// const framework::SelectedRows& input1,
// const framework::SelectedRows& input2,
// framework::SelectedRows* output);
//};
//
// template <typename T>
// struct SelectedRowsAddTensor {
// void operator()(
// const framework::SelectedRows& input1,
// const framework::Tensor& input2, framework::Tensor* output);
//};
// input2 = input1 + input2
template <typename T>
struct SelectedRowsAddTo {
void operator()(const framework::SelectedRows& input1,
const int64_t input2_offset,
framework::SelectedRows* input2) {
auto in1_height = input1.height();
PADDLE_MOBILE_ENFORCE(in1_height == input2->height());
auto& in1_rows = input1.rows();
auto& in2_rows = *(input2->mutable_rows());
auto& in1_value = input1.value();
auto* in2_value = input2->mutable_value();
// concat rows
in2_rows.Extend(in1_rows.begin(), in1_rows.end());
// auto in1_place = input1.place();
// PADDLE_ENFORCE(platform::is_cpu_place(in1_place));
// auto in2_place = input2->place();
// PADDLE_ENFORCE(platform::is_cpu_place(in2_place));
auto* in1_data = in1_value.data<T>();
auto* in2_data = in2_value->data<T>();
memory::Copy(in2_data + input2_offset, in1_data,
in1_value.numel() * sizeof(T));
}
};
// input2 = input1 + input2
template <typename T>
struct SelectedRowsAddToTensor {
void operator()(const framework::SelectedRows& input1,
framework::Tensor* input2) {
auto in1_height = input1.height();
auto in2_dims = input2->dims();
PADDLE_MOBILE_ENFORCE(in1_height == in2_dims[0]);
auto& in1_value = input1.value();
auto& in1_rows = input1.rows();
int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_MOBILE_ENFORCE(in1_row_numel == input2->numel() / in1_height);
auto* in1_data = in1_value.data<T>();
auto* input2_data = input2->data<T>();
for (size_t i = 0; i < in1_rows.size(); i++) {
for (int64_t j = 0; j < in1_row_numel; j++) {
input2_data[in1_rows[i] * in1_row_numel + j] +=
in1_data[i * in1_row_numel + j];
}
}
}
};
// namespace scatter {
//// functors for manipulating SelectedRows data
// template <typename T>
// struct MergeAdd {
// // unary functor, merge by adding duplicated rows in
// // the input SelectedRows object.
// framework::SelectedRows operator()(
// const framework::SelectedRows& input);
//};
// template <typename T>
// struct Add {
// framework::SelectedRows operator()(
// const framework::SelectedRows& input1,
// const framework::SelectedRows& input2) {
// framework::SelectedRows out;
// out.set_rows(input1.rows());
// out.set_height(input1.height());
// out.mutable_value()->mutable_data<T>(input1.value().dims(),
// );
// auto e_out = framework::EigenVector<T>::Flatten(*(out.mutable_value()));
// auto e_in1 = framework::EigenVector<T>::Flatten(input1.value());
// auto e_in2 = framework::EigenVector<T>::Flatten(input2.value());
// e_out.device(*context.eigen_device()) = e_in1 + e_in2;
// return out;
// }
//};
// template <typename T>
// struct Mul {
// // multiply two SelectedRows
// framework::SelectedRows operator()(
// const framework::SelectedRows& input1,
// const framework::SelectedRows& input2) {
// framework::SelectedRows out;
// out.set_rows(input1.rows());
// out.set_height(input1.height());
// out.mutable_value()->mutable_data<T>(input1.value().dims()
// );
// auto e_out = framework::EigenVector<T>::Flatten(*(out.mutable_value()));
// auto e_in1 = framework::EigenVector<T>::Flatten(input1.value());
// auto e_in2 = framework::EigenVector<T>::Flatten(input2.value());
// e_out.device(*context.eigen_device()) = e_in1 * e_in2;
// return out;
// }
// // multiply scalar to SelectedRows
// framework::SelectedRows operator()(
// const framework::SelectedRows& input1,
// const T input2) {
// framework::SelectedRows out;
// out.set_rows(input1.rows());
// out.set_height(input1.height());
// out.mutable_value()->mutable_data<T>(input1.value().dims(),
// );
// auto e_out = framework::EigenVector<T>::Flatten(*(out.mutable_value()));
// auto e_in1 = framework::EigenVector<T>::Flatten(input1.value());
// e_out.device(*context.eigen_device()) = input2 * e_in1;
// return out;
// }
//};
enum class ScatterOps { ASSIGN, ADD, SUB, SUBBY, MUL, DIV, DIVBY };
// out = selected_rows_in / tensor
template <typename T>
struct UpdateToTensor {
void operator()(const ScatterOps& op, const framework::SelectedRows& input1,
framework::Tensor* input2);
};
// namespace scatter
} // namespace math
} // namespace operators
} // namespace paddle_mobile
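To make the scatter-add in SelectedRowsAddToTensor concrete: with rows {0, 2} of a height-4 table and row width 2, each stored row is added into the matching row of the dense tensor. A numeric trace against plain arrays:

#include <cstdio>

int main() {
  const long rows[2] = {0, 2};                  // in1_rows
  const float sparse[2][2] = {{1, 1}, {2, 2}};  // in1_value, row-major
  float dense[4][2] = {{10, 10}, {20, 20}, {30, 30}, {40, 40}};
  for (int i = 0; i < 2; ++i) {
    for (int j = 0; j < 2; ++j) {
      dense[rows[i]][j] += sparse[i][j];  // scatter-add into matching row
    }
  }
  std::printf("%g %g\n", dense[0][0], dense[2][0]);  // prints 11 32
  return 0;
}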
......@@ -35,6 +35,7 @@ using framework::AttributeMap;
using framework::LoDTensor;
using framework::Scope;
using framework::Tensor;
using framework::Variable;
using std::string;
using std::vector;
......@@ -182,6 +183,11 @@ class OpParam {
return GetMultiVarValue<T>("X", inputs, scope);
}
static vector<Variable *> InputMultiVarsFrom(const VariableNameMap &inputs,
const Scope &scope) {
return GetMultiVar("X", inputs, scope);
}
template <typename T>
static T *OutputBatchGateFrom(const VariableNameMap &outputs,
const Scope &scope) {
......@@ -216,6 +222,11 @@ class OpParam {
return GetVarValue<T>("Output", outputs, scope);
}
static Variable *OutVarFrom(const VariableNameMap &outputs,
const Scope &scope) {
return GetVar("Out", outputs, scope);
}
template <typename T>
static T *OutFrom(const VariableNameMap &outputs, const Scope &scope) {
return GetVarValue<T>("Out", outputs, scope);
......@@ -286,6 +297,19 @@ class OpParam {
}
}
static Variable *GetVar(const string &key, const VariableNameMap &var_map,
const Scope &scope) {
PADDLE_MOBILE_ENFORCE(var_map.count(key) > 0,
"%s is not contained in var_map", key.c_str())
auto var_vec = var_map.at(key);
if (!var_vec.empty()) {
auto var = scope.FindVar(var_vec[0]);
return var;
} else {
return nullptr;
}
}
static std::string getkey(const string &key, const VariableNameMap &var_map,
int index) {
auto var_vec = var_map.at(key);
......@@ -319,6 +343,19 @@ class OpParam {
}
return var_res;
}
static vector<Variable *> GetMultiVar(const string &key,
const VariableNameMap &var_map,
const Scope &scope) {
auto var_vecs = var_map.at(key);
assert(var_vecs.size() > 0);
vector<Variable *> var_res;
for (auto &var_vec : var_vecs) {
auto var = scope.FindVar(var_vec);
var_res.push_back(var);
}
return var_res;
}
};
template <typename Dtype>
......@@ -405,6 +442,45 @@ class ElementwiseAddParam : OpParam {
#endif
};
template <typename Dtype>
class ElementwiseMulParam : OpParam {
typedef typename DtypeTensorTrait<Dtype>::gtype GType;
typedef typename DtypeTensorTrait<Dtype>::rtype RType;
public:
ElementwiseMulParam(const VariableNameMap &inputs,
const VariableNameMap &outputs, const AttributeMap &attrs,
const Scope &scope) {
input_x_ = InputXFrom<GType>(inputs, scope);
input_y_ = InputYFrom<GType>(inputs, scope);
out_ = OutFrom<GType>(outputs, scope);
axis_ = GetAttr<int>("axis", attrs);
}
const GType *InputX() const { return input_x_; }
const GType *InputY() const { return input_y_; }
GType *Out() const { return out_; }
const int &Axis() const { return axis_; }
private:
GType *input_x_;
GType *input_y_;
GType *out_;
int axis_;
#ifdef PADDLE_MOBILE_FPGA
private:
fpga::EWMulArgs fpga_EW_mul_args;
public:
const fpga::EWMulArgs &FpgaArgs() const { return fpga_EW_mul_args; }
void SetFpgaArgs(const fpga::EWMulArgs &args) { fpga_EW_mul_args = args; }
#endif
};
#ifdef FUSION_ELEMENTWISEADDRELU_OP
template <typename Dtype>
using ElementwiseAddReluParam = ElementwiseAddParam<Dtype>;
......@@ -490,6 +566,46 @@ class ConcatParam : public OpParam {
};
#endif
#ifdef SUM_OP
template <typename Dtype>
class SumParam : public OpParam {
typedef typename DtypeTensorTrait<Dtype>::gtype GType;
typedef typename DtypeTensorTrait<Dtype>::rtype RType;
public:
SumParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
const AttributeMap &attrs, const Scope &scope) {
inputs_vars_ = InputMultiVarsFrom(inputs, scope);
out_var_ = OutVarFrom(outputs, scope);
inputs_ = InputMultiFrom<GType>(inputs, scope);
out_ = OutFrom<GType>(outputs, scope);
}
vector<Variable *> InputsVars() const { return inputs_vars_; }
Variable *OutVar() const { return out_var_; }
vector<GType *> Inputs() const { return inputs_; }
GType *Out() const { return out_; }
private:
vector<Variable *> inputs_vars_;
Variable *out_var_;
vector<GType *> inputs_;
GType *out_;
#ifdef PADDLE_MOBILE_FPGA
private:
fpga::SumArgs fpga_sum_args;
public:
const fpga::SumArgs &FpgaArgs() const { return fpga_sum_args; }
void SetFpgaArgs(const fpga::SumArgs &args) { fpga_sum_args = args; }
#endif
};
#endif
#ifdef LRN_OP
template <typename Dtype>
class LrnParam : public OpParam {
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef SUM_OP
#include <vector>
#include "operators/sum_op.h"
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void SumOp<Dtype, T>::InferShape() const {
auto inputs = this->param_.Inputs();
const size_t n = inputs.size();
std::vector<DDim> inputs_dims;
inputs_dims.reserve(n);
  for (size_t i = 0; i < n; i++) {
    inputs_dims.push_back(inputs[i]->dims());
  }
  if (n == 1) {
    DLOG << "Warning: sum op has only one input, "
            "which may waste memory";
  }
framework::DDim in_dim({0});
for (auto& x_dim : inputs_dims) {
if (framework::product(x_dim) == 0) {
continue;
}
if (framework::product(in_dim) == 0) {
in_dim = x_dim;
} else {
PADDLE_MOBILE_ENFORCE(in_dim == x_dim,
"input tensors must have same shape");
}
}
this->param_.Out()->Resize(in_dim);
}
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(sum, ops::SumOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
REGISTER_OPERATOR_MALI_GPU(sum, ops::SumOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(sum, ops::SumOp);
#endif
#endif
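The shape rule InferShape enforces above can be restated independently of the framework: every input with a nonzero element count must share one shape, which becomes the output shape, and empty inputs are skipped. A hedged stand-alone version:

#include <stdexcept>
#include <vector>

std::vector<long> InferSumShape(const std::vector<std::vector<long>>& dims) {
  auto numel = [](const std::vector<long>& d) {
    long n = 1;
    for (long v : d) n *= v;
    return n;
  };
  std::vector<long> out{0};  // product stays 0 until a non-empty input is seen
  for (const auto& d : dims) {
    if (numel(d) == 0) continue;  // ignore empty inputs
    if (numel(out) == 0) {
      out = d;                    // first non-empty input fixes the shape
    } else if (out != d) {
      throw std::runtime_error("input tensors must have same shape");
    }
  }
  return out;
}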
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef SUM_OP
#pragma once
#include <string>
#include "framework/operator.h"
#include "operators/kernel/sum_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
using std::string;
template <typename DeviceType, typename T>
class SumOp : public framework::OperatorWithKernel<
DeviceType, SumParam<DeviceType>,
operators::SumKernel<DeviceType, T>> {
public:
SumOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<DeviceType, SumParam<DeviceType>,
operators::SumKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
using framework::OperatorWithKernel<
DeviceType, SumParam<DeviceType>,
operators::SumKernel<DeviceType, T>>::OperatorWithKernel;
void InferShape() const override;
protected:
};
} // namespace operators
} // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(sum);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(sum);
#endif
#ifdef PADDLE_MOBILE_FPGA
USE_OP_FPGA(sum);
#endif
#endif