diff --git a/paddle/function/BufferArg.h b/paddle/function/BufferArg.h
index d787d2814d88bfc8753620e5f51d7a99936e432f..3d28249f69c2bbc6efbd539bcc66dfa1282275bd 100644
--- a/paddle/function/BufferArg.h
+++ b/paddle/function/BufferArg.h
@@ -38,16 +38,40 @@ enum SparseDataType {
 enum SparseDataFormat { SPARSE_CSR_FORMAT = 0, SPARSE_CSC_FORMAT = 1 };
 
-/**
- * BufferArg used as the argument type for Function.
- */
 class BufferArg;
 class SequenceArg;
 class SparseMatrixArg;
 typedef std::shared_ptr<BufferArg> BufferArgPtr;
 
-// an array of arbitrary dimensions
+/**
+ * \brief BufferArg is used as the argument type of Function.
+ *
+ * The arguments of a Paddle Function can be of four Buffer types.
+ * 1. BufferArg for a dense Buffer of any dimension.
+ * 2. SequenceIdArg for a Buffer of sequence start positions.
+ * 3. SequenceArg for a Buffer of sequence data.
+ * 4. SparseMatrixArg for a Buffer of a sparse matrix.
+ *
+ * A BufferArg used as a Function output carries an ArgType property.
+ * The argType_ of an output BufferArg determines whether the result of
+ * the Function calculation is assigned to the output Buffer or added
+ * to it.
+ */
 class BufferArg {
+public:
+  // ArgType is only used by an output BufferArg.
+  // For an input argument, argType_ is ignored.
+  // For an output argument, argType_ needs to be set.
+  enum ArgType {
+    UNSPECIFIED = 0,
+    ASSIGN_TO = 1,
+    ADD_TO = 2,
+  };
+
+  void setArgType(ArgType argType) { argType_ = argType; }
+
+  ArgType getArgType() const { return argType_; }
+
 public:
   BufferArg(void* buf, ValueType valueType, const TensorShape& shape)
       : buf_(buf), valueType_(valueType), shape_(shape) {}
@@ -56,7 +80,8 @@ public:
       : buf_(buf), valueType_(valueType) {}
 
   BufferArg(const Matrix& matrix)
-      : buf_(reinterpret_cast<void*>(matrix.getData())),
+      : buf_(
+            const_cast<void*>(reinterpret_cast<const void*>(matrix.getData()))),
         valueType_(DataType<real>::value),
         shape_(2) {
     shape_.setDim(0, matrix.getHeight());
@@ -64,21 +89,24 @@ public:
   }
 
   BufferArg(const Matrix& matrix, const TensorShape& shape)
-      : buf_(reinterpret_cast<void*>(matrix.getData())),
+      : buf_(
+            const_cast<void*>(reinterpret_cast<const void*>(matrix.getData()))),
         valueType_(DataType<real>::value),
         shape_(shape) {
     CHECK_EQ(matrix.getElementCnt(), shape.getElements());
   }
 
   BufferArg(const Vector& vector)
-      : buf_(reinterpret_cast<void*>(vector.getData())),
+      : buf_(
+            const_cast<void*>(reinterpret_cast<const void*>(vector.getData()))),
         valueType_(DataType<real>::value),
         shape_(1) {
     shape_.setDim(0, vector.getSize());
   }
 
   BufferArg(const IVector& vector)
-      : buf_(reinterpret_cast<void*>(vector.getData())),
+      : buf_(
+            const_cast<void*>(reinterpret_cast<const void*>(vector.getData()))),
         valueType_(VALUE_TYPE_INT32),
         shape_(1) {
     shape_.setDim(0, vector.getSize());
@@ -124,6 +152,7 @@ protected:
   ValueType valueType_;
   TensorShape shape_;
   BufferType bufferType_;
+  ArgType argType_ = UNSPECIFIED;
   // leading dimensions. The size is dims_.size()
   // Dims lds_;
 };
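A minimal caller-side sketch of the new argument contract, combining the BufferArg interface above with the BufferArgs::addArg overloads from Function.h below. The matrix sizes and the NCHW shape are invented for illustration, and TensorShape is assumed to accept an initializer list of dimensions:

void bufferArgUsageSketch() {
  // Inputs: argType_ is ignored, so a plain addArg() is enough. The extra
  // TensorShape reinterprets the 2-D Matrix as a 4-D (NCHW) image buffer.
  // Assumes TensorShape(std::initializer_list<size_t>); shape is hypothetical.
  CpuMatrix image(64, 3 * 32 * 32);  // one flattened image per row
  BufferArgs inputs;
  inputs.addArg(image, TensorShape({64, 3, 32, 32}));

  // Outputs: the caller declares how the Function must write the buffer.
  CpuMatrix result(64, 3 * 32 * 32);
  BufferArg output(result);
  output.setArgType(BufferArg::ADD_TO);  // ADD_TO: result += Function(inputs)
  CHECK_EQ(output.getArgType(), BufferArg::ADD_TO);
}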
diff --git a/paddle/function/Function.h b/paddle/function/Function.h
index 27ebe808aaf446771afb931e3a4611519cf340f0..88d6824aa393933f29eb62975627b80133a8783c 100644
--- a/paddle/function/Function.h
+++ b/paddle/function/Function.h
@@ -56,12 +56,18 @@ public:
   BufferArgs() {}
   size_t size() const { return args_.size(); }
 
-  // add argument into BufferArgss
+  // add argument into BufferArgs
+  // Tensor can be Matrix, Vector, IVector.
   template <typename Tensor>
   void addArg(const Tensor& arg) {
     args_.push_back(std::make_shared<BufferArg>(arg));
   }
 
+  // Add arg into BufferArgs and reshape the arg.
+  //
+  // For example, arg may represent an image buffer, but a Matrix can only
+  // represent a two-dimensional Tensor, so an extra argument is needed to
+  // describe the shape of the image buffer.
   void addArg(const Matrix& arg, const TensorShape& shape);
 
   void addArg(const CpuSparseMatrix& arg);
@@ -78,10 +84,20 @@ private:
 };
 
 /**
- * Base class for Function.
+ * \brief Base class for Function.
  * The basic Function implementation requires override init and calc interfaces.
- * Need to pay attention to the inouts argument. For the input argument
- * that will be modified, it needs to be passed through inouts.
+ *
+ * Function inputs are read-only, while Function outputs have two modes:
+ * ASSIGN_TO and ADD_TO.
+ * If output.getArgType() == ASSIGN_TO, this is assign mode: the calculation
+ * result of the Function is assigned to the output BufferArg.
+ * If output.getArgType() == ADD_TO, this is add mode: the calculation
+ * result of the Function is added to the output BufferArg.
+ *
+ * For example:
+ * ASSIGN_TO: output = Function(inputs)
+ * ADD_TO:    output += Function(inputs)
+ * If a Function has more than one output, each output can use a different mode.
 */
class FunctionBase {
public:
@@ -89,9 +105,7 @@ public:
 
   virtual void init(const FuncConfig& config) {}
 
-  virtual void calc(const BufferArgs& inputs,
-                    const BufferArgs& outputs,
-                    const BufferArgs& inouts) {}
+  virtual void calc(const BufferArgs& inputs, const BufferArgs& outputs) {}
 
   static ClassRegistrar<FunctionBase> funcRegistrar_;
 };
diff --git a/paddle/function/FunctionTest.cpp b/paddle/function/FunctionTest.cpp
index 7c3d6684cded1db1801cb18d637bbd4a976298ca..7ce908320a6f6f764e8fdacc96432aca78d7b2df 100644
--- a/paddle/function/FunctionTest.cpp
+++ b/paddle/function/FunctionTest.cpp
@@ -35,7 +35,7 @@ void FunctionApi(GpuMatrix& output, const GpuMatrix& input) {
 
 template <DeviceType DType>
 void Function(const BufferArgs& arguments) {
-  auto input = arguments[0].matrix<DType>();
+  const auto input = arguments[0].matrix<DType>();
   auto output = arguments[1].matrix<DType>();
   FunctionApi<DType>(output, input);
 }
diff --git a/paddle/math/Matrix.h~RFbb8b484f.TMP b/paddle/math/Matrix.h~RFbb8b484f.TMP
deleted file mode 100644
index d89b0f67b3c98d7a651f297d703a397c3461b6d8..0000000000000000000000000000000000000000
--- a/paddle/math/Matrix.h~RFbb8b484f.TMP
+++ /dev/null
@@ -1,1870 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <stdint.h>
-#include <memory>
-#include <thread>
-
-#include "paddle/utils/Logging.h"
-#include "paddle/utils/ThreadLocal.h"
-
-#include <hl_gpu.h>
-
-#include "BaseMatrix.h"
-#include "MemoryHandle.h"
-#include "Vector.h"
-#include "paddle/utils/ThreadLocal.h"
-#include "paddle/utils/common.h"
-
-namespace paddle {
-
-enum SparseValueType { NO_VALUE = 0, FLOAT_VALUE = 1 };
-
-/**
- * @brief matrix sparse_format .
- *
- * nnz represents nonzero number in sparse matrix.
- *
- * SPARSE_CSR: row major matrix. length of row is height_ + 1, each element
- * represents row start index in Matrix. length of col and value are nnz.
- * - * SPARSE_CSC: col major matrix. length of col is width_ + 1, each element - * represents col start index in Matrix. length of col and value are nnz. - * - * @code - * for example: [0, 1, 0, 2, 0; - * 1, 0, 0, 0, 0; - * 0, 0, 0, 2, 5]; - * SPARSE_CSR row [0, 2, 3, 5]; - * col [1, 3, 0, 3, 4]; - * value [1, 2, 1, 2, 5] - * SPARSE_CSC col [0, 1, 2, 2, 4, 5]; - * row [1, 0, 0, 2, 2]; - * value [1, 1, 2, 2, 5] - * @endcode - */ -enum SparseFormat { SPARSE_CSR = 0, SPARSE_CSC = 1 }; - -class Matrix; -class GpuMatrix; -class CpuMatrix; -class CpuSparseMatrix; -class GpuSparseMatrix; -typedef std::shared_ptr MatrixPtr; -typedef std::shared_ptr GpuMatrixPtr; -typedef std::shared_ptr CpuMatrixPtr; -typedef std::shared_ptr GpuSparseMatrixPtr; -typedef std::shared_ptr CpuSparseMatrixPtr; - -/** - * Copy or assignemnt constructor will share the data as opposed to making a - * copy of the original data. To make a copy of the orinal data, use copyFrom() - * instead. - */ -class Matrix : public BaseMatrix { -protected: - Matrix(MemoryHandlePtr memHandle, - size_t height, - size_t width, - bool trans, - bool use_gpu); - - Matrix(real* data, size_t height, size_t width, bool trans, bool use_gpu); - - Matrix(real* data, - size_t height, - size_t width, - size_t stride, - bool trans, - bool use_gpu); - - static ThreadLocal tmpMat_; - -public: - size_t elementCnt_; // maximal number of elements which can be held in data_ - MemoryHandlePtr memoryHandle_; - -public: - virtual ~Matrix() {} - - static MatrixPtr create(MemoryHandlePtr memHandle, - size_t height, - size_t width, - bool trans = false); - static MatrixPtr create(size_t height, - size_t width, - bool trans = false, - bool useGpu = false); - static MatrixPtr create(real* data, - size_t height, - size_t width, - bool trans = false, - bool useGpu = false); - static MatrixPtr create(real* data, - size_t height, - size_t width, - size_t stride, - bool trans = false, - bool useGpu = false); - - static MatrixPtr createSparseMatrix(size_t height, - size_t width, - size_t nnz, - SparseValueType valueType = FLOAT_VALUE, - bool trans = false, - bool useGpu = false); - static MatrixPtr createSparseMatrix(size_t height, - size_t width, - size_t nnz, - SparseValueType valueType = FLOAT_VALUE, - SparseFormat foramt = SPARSE_CSR, - bool trans = false, - bool useGpu = false); - - static MatrixPtr createSparseMatrix(real* data, - int* row, - int* col, - size_t height, - size_t width, - size_t nnz, /* used to allocate space */ - SparseValueType valueType, /*value type*/ - SparseFormat format, - bool trans, - bool useGpu); - - static void resizeOrCreateSparseMatrix( - MatrixPtr& matrix, - size_t height, - size_t width, - size_t nnz, - SparseValueType valueType = FLOAT_VALUE, - SparseFormat foramt = SPARSE_CSR, - bool trans = false, - bool useGpu = false); - - static void resizeOrCreate(MatrixPtr& a, - size_t height, - size_t width, - bool trans = false, - bool useGpu = false); - - /** - * @brief set the data buffer used to hold the matrix data. - * - * caller should make sure that the size of data is at least - * sizeof(real)*height*width. 
- */ - void setData(real* data) { - BaseMatrix::setData(data); - memoryHandle_.reset(); - } - - /// the data should be contiguous - void setData(real* data, size_t newHeight, size_t newWidth) { - setData(data); - height_ = newHeight; - width_ = newWidth; - elementCnt_ = newHeight * newWidth; - stride_ = width_; - } - - size_t getWidth() const { return width_; } - size_t getHeight() const { return height_; } - size_t getStride() const { return stride_; } - size_t getElementCnt() const { return elementCnt_; } - virtual real* getData() { return data_; } - virtual const real* getData() const { return data_; } - bool isTransposed() const { return trans_; } - bool isContiguous() const { return stride_ == width_ || height_ == 1; } - - // If sparse matrix, need to dynamic_cast to CpuSparseMatrix/GpuSparseMatrix - // befor call the following functions. - // Declare these functions in the base class just easy to call them. - // And these declarations should be moved to base class of sparse matrix - // if refactor sparse matrix - virtual int* getRows() const { - LOG(FATAL) << "Not implemented"; - return nullptr; //! suppress warning for no return value. - } - - virtual int* getCols() const { - LOG(FATAL) << "Not implemented"; - return nullptr; //! suppress warning for no return value. - } - - virtual SparseFormat getFormat() const { - LOG(FATAL) << "Not implemented"; - return SPARSE_CSR; //! suppress warning for no return value. - } - - virtual SparseValueType getValueType() const { - LOG(FATAL) << "Not implemented"; - return NO_VALUE; //! suppress warning for no return value. - } - - /** - * @brief matrix elment-wise add - * - * Named add3 just because add/add2 has been used in BaseMatrix.cu - * and they are not virtual function. - */ - virtual void add3(MatrixPtr b) { LOG(FATAL) << "Not implemented"; } - - MemoryHandlePtr getMemoryHandle() const { return memoryHandle_; } - - virtual void zeroMem() { LOG(FATAL) << "Not implemented"; } - - virtual void resetOne() { LOG(FATAL) << "Not implemented"; } - - void setDiag(real value); - - virtual void copyFrom(const Matrix& src) { LOG(FATAL) << "Not implemented"; } - - virtual void trimFrom(const CpuSparseMatrix& src) { - LOG(FATAL) << "Not implemented"; - } - - // asynchronous copy - virtual void copyFrom(const Matrix& src, hl_stream_t stream) { - LOG(FATAL) << "Not implemented"; - } - - MatrixPtr subMatrix(size_t startRow, - size_t endRow, - size_t startCol, - size_t endCol); - - MatrixPtr subRowMatrix(size_t startRow, size_t endRow) { - return subMatrix(startRow, endRow, 0, getWidth()); - } - - MatrixPtr subColMatrix(size_t startCol, size_t endCol) { - return subMatrix(0, getHeight(), startCol, endCol); - } - - virtual MatrixPtr subMatrix(size_t startRow, size_t numRows) { - CHECK_LE(startRow + numRows, getHeight()); - return Matrix::create(getData() + startRow * getWidth(), - numRows, - getWidth(), - trans_, - useGpu_); - } - virtual MatrixPtr subMatrix(size_t startRow, size_t numRows, MatrixPtr dest) { - CHECK_LE(startRow + numRows, getHeight()); - CHECK_EQ(useGpu_, dest->useGpu_); - dest->setData(this->rowBuf(startRow), numRows, getWidth()); - return dest; - } - - /** - * If this is GpuMatrix, src is assumed to be CPU memory - * - * If this is CpuMatrix, src is assumed to be CPU memory - */ - virtual void copyFrom(const real* src, size_t size) { - LOG(FATAL) << "Not implemented"; - } - - virtual void copyFrom(const real* src, const int64_t* seq) { - LOG(FATAL) << "Not implemented"; - } - - /** - * @brief convert a int vector to a real matrix. 
- * - * (1) source and dest are both in CPU. - * - * (2) sizes are exactly match. - */ - virtual void copyFrom(const IVector& src) { - LOG(FATAL) << "copy data from int vector only available on CpuMatrix."; - } - - virtual void copyByRowIndex(Matrix& b, const IVector& rowIndex) { - LOG(FATAL) << "Not implemented"; - } - - /** - * @brief Create a matrix with the same type (GpuMatrix, CpuMatrix, - * NonValueSparseMatrix, etc.) as this. - * - * If height and width is zero, the new matrix will have the same size - * as this, otherwise the new matrix will have the specified size. - * - */ - virtual MatrixPtr clone(size_t height = 0, - size_t width = 0, - bool useGpu = false) { - LOG(FATAL) << "Not implemented"; - return nullptr; - } - - virtual real* getRowBuf(size_t row) { - LOG(FATAL) << "Not implemented"; - return nullptr; - } - - virtual real getElement(size_t x, size_t y) const { - LOG(FATAL) << "Not implemented"; - return 0; - } - - virtual real getSum() { - LOG(FATAL) << "Not implemented"; - return 0; - } - - virtual void accumulateColSum(Matrix& src) { - LOG(FATAL) << "Not implemented"; - } - - virtual real getAbsSum() { - LOG(FATAL) << "Not implemented"; - return 0; - } - - /** - * @note Original data may not be preserved after resize(). - */ - virtual void resize(size_t newHeight, size_t newWidth) = 0; - - /** - * @note This should only be used for sparse matrix. - */ - virtual void resize(size_t newHeight, - size_t newWidth, - size_t newNnz, /* total item used to allocate space */ - SparseValueType valueType, - SparseFormat format) = 0; - - /** - * @brief This should only be used for sparse matrix. - * - * Currently must be called for each row in order. - * The matrix is not valid until setRow is called for the last row. - */ - virtual void setRow(size_t row, - size_t colNum, - const unsigned int* cols, - const real* values) = 0; - - virtual MatrixPtr getTranspose() = 0; - - /** - * @brief hard transpose. - * - * allocate matTrans' memory outside, then set memAlloc as false; - * else set as true. - */ - virtual void transpose(MatrixPtr matTrans, bool memAlloc) { - LOG(FATAL) << "Not implemented"; - } - - virtual MatrixPtr getInverse() { - LOG(FATAL) << "Not implemented"; - return nullptr; - } - - /** - * @brief inverse. - * - * if allocate matInv's memory outside, then set memAlloc as false; - * else set as true. - */ - virtual void inverse(MatrixPtr matInv, bool memAlloc) { - LOG(FATAL) << "Not implemented"; - } - -public: - /// Only set all variables to 0 or NULL but not free them. - virtual void clear() { - height_ = 0; - width_ = 0; - data_ = NULL; - } - - void reshape(size_t height, size_t width); - - /// add b to each sample of this. - virtual void addBias(Matrix& b, real scale) { - LOG(FATAL) << "Not implemented"; - } - - virtual void addSharedBias(Matrix& b, real scale) { - LOG(FATAL) << "Not implemented"; - } - - void addBias(Matrix& b, real scale, bool sharedBias) { - if (!sharedBias) { - addBias(b, scale); - } else { - addSharedBias(b, scale); - } - } - - /// add each sample from a to this. 
- virtual void collectBias(Matrix& a, real scale) { - LOG(FATAL) << "Not implemented"; - } - - virtual void collectSharedBias(Matrix& a, real scale) { - LOG(FATAL) << "Not implemented"; - } - - void collectBias(Matrix& a, real scale, bool sharedBias) { - if (!sharedBias) { - collectBias(a, scale); - } else { - collectSharedBias(a, scale); - } - } - - virtual void sequenceAvgForward(Matrix& a, - const IVector& startsPos, - int mode) { - LOG(FATAL) << "Not implemented"; - } - - /** - * @code - * this = scaleAB*(a*b) + scaleT*this - * @endcode - */ - virtual void mul(const Matrix& a, - const Matrix& b, - real scaleAB, - real scaleT) { - LOG(FATAL) << "Not implemented"; - } - - /// Add a vector (column) b to matrix a, column by column. - virtual void addColumnVector(const Matrix& b) { - LOG(FATAL) << "Not implemented"; - } - - /** - * @code - * For j < codeLength: - * this(i, j) += vec(index(i, j), 0) - * where index(i, j) = ((codes(i) + numClasses) >> (j + 1)) - 1 - * @endcode - */ - virtual void addByBitCode(size_t numClasses, - const IVector& codes, - const Matrix& vec) { - (void)numClasses; - (void)codes; - (void)vec; - LOG(FATAL) << "Not implemeted"; - } - - /** - * @code - * For j < codeLength: - * vec(index(i, j), 0) += this(i, j) - * where index is same as the index for addByBitCode - * @endcode - */ - virtual void addByBitCodeBackward(size_t numClasses, - const IVector& codes, - Matrix& vec) { - (void)numClasses; - (void)codes; - (void)vec; - LOG(FATAL) << "Not implemeted"; - } - - /** - * @code - * For j < codeLength: - * this(i, j) += - * where index is same as the index for addByBitCode - * @endcode - */ - virtual void mulByBitCode(size_t numClasses, - const IVector& codes, - const Matrix& mat, - const Matrix& input) { - (void)numClasses; - (void)codes; - (void)mat; - (void)input; - LOG(FATAL) << "Not implemeted"; - } - - /** - * @code - * For j < codeLength: - * mat.row(index(i, j)) += this(i, j) * input.row(i) - * where index is same as the index for addByBitCode - * @endcode - */ - virtual void mulByBitCodeBackwardWeight(size_t numClasses, - const IVector& codes, - Matrix& mat, - const Matrix& input) { - (void)numClasses; - (void)codes; - (void)mat; - (void)input; - LOG(FATAL) << "Not implemeted"; - } - - /** - * @code - * For j < codeLength: - * input.row(i) += this(i, j) * mat.row(index(i, j)) - * where index is same as the index for addByBitCode - * @endcode - */ - virtual void mulByBitCodeBackwardError(size_t numClasses, - const IVector& codes, - const Matrix& mat, - Matrix& input) { - (void)numClasses; - (void)codes; - (void)mat; - (void)input; - LOG(FATAL) << "Not implemeted"; - } - - /** - * @code - * For j < codeLength - * sum(i, 0) = scaleSum * \sum_j bit(i, j) * this(i, j) - * where bit(i, j) = ((codes(i) + numClasses) & 2^j) ? 
1 : 0 - * @endcode - */ - virtual void sumByBitCode(size_t numClasses, - IVector& codes, - Matrix& sum, - real scaleSum) { - (void)numClasses; - (void)codes; - (void)sum; - (void)scaleSum; - LOG(FATAL) << "Not implemeted"; - } - - /** - * @code - * For j < codeLength - * this(i, j) -= bit(i, j) - * where bit(i, j) is same as that for sumByBitCode - * @endcode - */ - virtual void subByBitCode(size_t numClasses_, IVector& codes) { - (void)numClasses_; - (void)codes; - LOG(FATAL) << "Not implemeted"; - } - - /** - * add the sum of each row of this to mat - */ - virtual void rowSum(Matrix& sum) { - (void)sum; - LOG(FATAL) << "Not implemeted"; - } - - /** - * set the max of each row of this to mat - */ - virtual void rowMax(Matrix& max) { - (void)max; - LOG(FATAL) << "Not implemeted"; - } - - /** - * set the max of each column of this to mat - */ - virtual void colMax(Matrix& max) { LOG(FATAL) << "not implemented"; } - - /** - * @brief Get the top k elements of each column of this matrix. - * - * The row ids and values of these elements are stored in - * maxIds and max respectively. where k is the size of maxIds. - * And note that the top k elements are not sorted. - */ - virtual void colMax(IVector& maxIds, Matrix& maxVal) { - LOG(FATAL) << "not implemented"; - } - - virtual void maxoutForward(Matrix& a, - IVector& id, - size_t channels, - size_t groups) { - LOG(FATAL) << "not implemented"; - } - - virtual void maxoutBackward(Matrix& a, - IVector& id, - size_t channels, - size_t groups) { - LOG(FATAL) << "not implemented"; - } - - virtual void rowMaxId(IVector& maxIds) { LOG(FATAL) << "Not implemented"; } - - /** - * @brief Get the top k elements of each row of this matrix. - * - * The column ids and values of these elements are stored in - * maxIds and max respectively. where k is the size of maxIds. - * And note that the top k elements are not sorted. - */ - virtual void rowMax(IVector& maxIds, Matrix& max) { - LOG(FATAL) << "Not implemented"; - } - - /// normalize each row so that the sum of each row is 1. - virtual void rowNormalizeL1(Matrix& out) { - (void)out; - LOG(FATAL) << "Not implemeted"; - } - - /** - * @code - * this = a*b - * @endcode - */ - virtual void mul(const Matrix& a, const Matrix& b) { - LOG(FATAL) << "Not implemented"; - } - - /** - * @code - * this = scaleAB*(this*b) + scaleT*this - * @endcode - */ - virtual void rightMul(Matrix& b, real scaleAB, real scaleT) { - LOG(FATAL) << "Not implemented"; - } - - /** - * @code - * this = this* b - * @endcode - */ - virtual void rightMul(Matrix& b) { LOG(FATAL) << "Not implemented"; } - - /** - * @code - * this = scaleAB*(a*this) + scaleT*this - * @endcode - */ - virtual void leftMul(Matrix& a, real scaleAB, real scaleT) { - LOG(FATAL) << "Not implemented"; - } - - /** - * @code - * this = a*this) - * @endcode - */ - virtual void leftMul(Matrix& a) { LOG(FATAL) << "Not implemented"; } - - /// merge the element for each col. - virtual void colMerge(Matrix& src) { LOG(FATAL) << "Not implemented"; } - - /// copy -log(output[label]) to this->data[i]. - virtual void oneHotCrossEntropy(Matrix& output, IVector& label) { - LOG(FATAL) << "Not implemented"; - } - - /// calculate the error of outputV according to label. - virtual void oneHotCrossEntropyBp(Matrix& outputV, IVector& label) { - LOG(FATAL) << "Not implemented"; - } - - /// copy -log(output[label]) to this->data[i]. 
- virtual void oneHotCrossEntropyWithSelfNorm(Matrix& output, - IVector& label, - real alpha) { - LOG(FATAL) << "Not implemented"; - } - - /// calculate the error of outputV according to label. - virtual void oneHotCrossEntropyWithSelfNormBp(Matrix& outputV, - IVector& label, - real alpha) { - LOG(FATAL) << "Not implemented"; - } - - /** - * \f[ - * a[i] = \sum_{j=-(N-1)/2}^{(N-1)/2} b_{i+j} * c_{j} - * \f] - * - * b contains M elements, - * c contains N elements (N is odd), - * b's index arithmetic is computed modulo M, - * c's index arithmetic is computed modulo N. - */ - virtual void circularConv(Matrix& b, Matrix& c) { - LOG(FATAL) << "Not implemented"; - } - - virtual void circularConvDerivative(Matrix& output, - Matrix& prevOut1, - Matrix& prevOut2, - Matrix& prevGrad1, - Matrix& prevGrad2) { - LOG(FATAL) << "Not implemented"; - } - - /* output_ij = exp(this_{ij}) / (sum_j exp(this_ij)) */ - virtual void softmax(Matrix& output) { - (void)output; - LOG(FATAL) << "Not implemeted"; - } - virtual void sequenceSoftmax(Matrix& output, const IVector& index) { - (void)output; - LOG(FATAL) << "Not implemeted"; - } - - virtual void softmaxBackward(Matrix& outputV) { - (void)outputV; - LOG(FATAL) << "Not implemeted"; - } - - /* - sum_i = sum_j this_ij * output_ij - this_ij = output_ij* (this_ij - sum_i) - */ - virtual void softmaxDerivative(Matrix& output, Matrix& sftmaxSum) { - LOG(FATAL) << "Not implemented"; - } - - /// calculate the sum of squares diff cost. - virtual void sumOfSquares(Matrix& output, Matrix& label) { - LOG(FATAL) << "Not implemented"; - } - - /// gradient of sumOfSquares. - virtual void sumOfSquaresBp(Matrix& outputV, Matrix& label) { - LOG(FATAL) << "Not implemented"; - } - - virtual void tanh(Matrix& output) { LOG(FATAL) << "Not implemented"; } - - virtual void tanhDerivative(Matrix& output) { - LOG(FATAL) << "Not implemented"; - } - - virtual void softrelu(Matrix& output) { LOG(FATAL) << "Not implemented"; } - - virtual void softreluDerivative(Matrix& output) { - LOG(FATAL) << "Not implemented"; - } - - virtual void scaledTanh(Matrix& output, real p1, real p2) { - LOG(FATAL) << "Not implemented"; - } - - /** - * cosine similarity, for each row i, - * this[i] = cos(output1[i], output2[i]) - * - * output2 can only have one row, then for each row i, - * this[i] = cos(output1[i], output2[0]) - */ - virtual void cosSim(Matrix& output1, Matrix& output2, real scale = 1.0f) { - LOG(FATAL) << "Not implemented"; - } - - virtual void cosSimDerivative(Matrix& output, - Matrix& prevOut1, - Matrix& prevOut2, - Matrix& prevGrad1, - Matrix& prevGrad2, - real scale = 1.0f) { - LOG(FATAL) << "Not implemented"; - } - - /// print out the values of elements to os - virtual void print(std::ostream& os) const { - LOG(FATAL) << "Not implemented"; - } - - /** - * print a part of the matrix - * from the (top,left) value to the (height, width) value (not included) - */ - virtual void print(std::ostream& os, size_t height, size_t width) const { - LOG(FATAL) << "Not implemented"; - } - - /// print one row to os - virtual void printOneRow(std::ostream& os, size_t idx) const { - LOG(FATAL) << "Not implemented"; - } - - virtual void check(std::ostream& os, Matrix& refMat, bool printDiff = true) {} - - virtual real getMin() { - LOG(FATAL) << "Not implemented"; - return 0; - } - virtual real getMax() { - LOG(FATAL) << "Not implemented"; - return 0; - } - - virtual void randomizeUniform() { LOG(FATAL) << "Not implemented"; } - - /** - * @brief calulate the error of classification - * - * output[i] = 1 if 
row i is an error. - * - * output[i] = 0 if row i is correct. - */ - virtual void classificationError(Matrix& output, IVector& label) { - LOG(FATAL) << "Not implemented"; - } - - /** - * This function is used to calculate the convolution: - * - * It will expand a feature matrix according to the - * convolution filters - */ - virtual void convExpand(Matrix& feature, - int feaImgHeight, - int feaImgWidth, - int channels, - int blockH, - int blockW, - int strideH, - int strideW, - int paddingH, - int paddingW, - int outputH, - int outputW) { - LOG(FATAL) << "Not implemeted"; - } - - /** - * This function is the reverse implementation of convExpand: - * - * Its function is to restore a expanded-matrix into a feature matrix - */ - virtual void convShrink(Matrix& expandColMat, - int thisImgHeight, - int thisImgWidth, - int channels, - int blockH, - int blockW, - int strideH, - int strideW, - int paddingH, - int paddingW, - int outputH, - int outputW, - real alpha = 1.0f, - real beta = 0.0f) { - LOG(FATAL) << "Not implemeted"; - } - - /** - * Pooling forward operation, pick out the largest element - * in the sizeX of value - */ - virtual void maxPoolForward(Matrix& inputMat, - size_t imgSizeH, - size_t imgSizeW, - size_t channels, - size_t sizeX, - size_t sizeY, - size_t strideH, - size_t strideW, - size_t outputH, - size_t outputW, - size_t paddingH, - size_t paddingW) { - LOG(FATAL) << "Not implemeted"; - } - - /// Pooling backward operation. - virtual void maxPoolBackward(Matrix& image, - size_t imgSizeH, - size_t imgSizeW, - Matrix& outGrad, - Matrix& outV, - size_t sizeX, - size_t sizeY, - size_t strideH, - size_t strideW, - size_t outputH, - size_t outputW, - real scaleTargets, - real scaleOutput, - size_t paddingH, - size_t paddingW) { - LOG(FATAL) << "Not implemeted"; - } - - /// Pooling forward operation, caculate the average of sizeX elements. - virtual void avgPoolForward(Matrix& input, - size_t imgSizeH, - size_t imgSizeW, - size_t channels, - size_t sizeX, - size_t sizeY, - size_t strideH, - size_t strideW, - size_t outputH, - size_t outputW, - size_t paddingH, - size_t paddingW) { - LOG(FATAL) << "Not implemeted"; - } - - virtual void avgPoolBackward(Matrix& input, - size_t imgSizeH, - size_t imgSizeW, - size_t sizeX, - size_t sizeY, - size_t strideH, - size_t strideW, - size_t outputH, - size_t outputW, - real scaleTargets, - real scaleOutput, - size_t paddingH, - size_t paddingW) { - LOG(FATAL) << "Not implemeted"; - } - - /** - * Input: one or more sequences. Each sequence contains some instances. - * - * Output: output size is the number of input sequences (NOT input - * instances). - * - * output[i] is set to max_input[i]. 
- */ - virtual void maxSequenceForward(Matrix& input, - const IVector& sequence, - IVector& index) { - LOG(FATAL) << "Not implemeted"; - } - - virtual void maxSequenceBackward(Matrix& outputGrad, - const IVector& sequence, - IVector& index) { - LOG(FATAL) << "Not implemeted"; - } - - /** - * @code - * this.row[i] += table.row[ids[i]] - * if ids[i] == -1, it will be ignored - * @endcode - */ - virtual void selectRows(Matrix& table, IVector& ids) { - (void)table; - (void)ids; - LOG(FATAL) << "Not implemented"; - } - - /** - * @code - * this[i] = table[i, id[i]] - * @endcode - */ - virtual void selectElements(Matrix& table, IVector& ids) { - LOG(FATAL) << "Not implemented"; - } - - /** - * @code - * table.row[ids[i]] += this.row[i] - * if ids[i] == -1, it will be ignored - * @endcode - */ - virtual void addToRows(Matrix& table, IVector& ids) { - (void)table; - (void)ids; - LOG(FATAL) << "Not implemented"; - } - - /** - * @code - * table[i, id[i]] += this[i] - * @endcode - */ - virtual void addElements(Matrix& table, IVector& ids) { - LOG(FATAL) << "Not implemented"; - } - /** - * @brief cross entropy for multi binary labels - * - * @code - * this[i] = -sum(label[i][j]*log(output[i][j]) - * + (1-label[i][j])*log(1-output[i][j])) - * @endcode - */ - virtual void multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label) { - LOG(FATAL) << "Not implemented"; - } - - /** - * @brief The gradient of cross entropy for multi binary labels on output - * - * @code - * this[i][j] = -label[i][j]/output[i][j] - * + (1-label[i][j])/(1-output[i][j]) - * @endcode - */ - virtual void multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label) { - LOG(FATAL) << "Not implemented"; - } - - /** - * @brief Calculate the classification error for multi binary labels - * - * @code - * this[i] = sum((output[i][j] >= threshold && label[i][j] == 0) - * || (output[i][j] < threshold && label[i][j] == 1)) - * / output->getWidth() - * @endcode - */ - virtual void classificationErrorMulti(Matrix& output, - Matrix& label, - real threshold) { - LOG(FATAL) << "Not implemented"; - } - - virtual void paramReluForward(Matrix& data, Matrix& W) { - LOG(FATAL) << "Not implemented"; - } - virtual void paramReluBackwardW(Matrix& oGrad, Matrix& data) { - LOG(FATAL) << "Not implemented"; - } - virtual void paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W) { - LOG(FATAL) << "Not implemented"; - } - - virtual void bilinearForward(const Matrix& in, - const size_t inImgH, - const size_t inImgW, - const size_t outImgH, - const size_t outImgW, - const size_t numChannels, - const real ratioH, - const real ratioW) { - LOG(FATAL) << "Not implemented"; - } - virtual void bilinearBackward(const Matrix& out, - const size_t outImgH, - const size_t outImgW, - const size_t inImgH, - const size_t inImgW, - const size_t numChannels, - const real ratioH, - const real ratioW) { - LOG(FATAL) << "Not implemented"; - } - - template - void operator=(const ExpressionType& expr) { - if (useGpu_) { - TensorGpuApply(*this, expr); - } else { - TensorCpuApply(*this, expr); - } - } - - bool isEmpty() const { - return data_ == nullptr; - } - - explicit operator bool() const { - return !isEmpty(); - } -}; - -inline std::ostream& operator<<(std::ostream& os, const Matrix& mat) { - mat.print(os); - return os; -} - -class GpuMatrix : public Matrix { -public: - GpuMatrix(); - - GpuMatrix(size_t height, size_t width, bool trans = false); - GpuMatrix(real* data, size_t height, size_t width, bool trans = false) - : Matrix(data, height, width, trans, true) {} - 
GpuMatrix(real* data, - size_t height, - size_t width, - size_t stride, - bool trans = false) - : Matrix(data, height, width, stride, trans, true) {} - GpuMatrix(GpuMemHandlePtr dataHandle, - size_t height, - size_t width, - bool trans = false) - : Matrix(dataHandle, height, width, trans, true) {} - ~GpuMatrix(); - - void zeroMem(); - void resetOne(); - void setDiag(real value); - - void resize(size_t newHeight, size_t newWidth); - void resize(size_t newHeight, - size_t newWidth, - size_t newNnz, /* used to allocate space */ - SparseValueType valueType, - SparseFormat format) { - LOG(FATAL) << "Only Support Sparse Matrix"; - } - void setRow(size_t row, - size_t colNum, - const unsigned int* cols, - const real* values) { - LOG(FATAL) << "Only Support Sparse Matrix"; - } - - /** - * Copy the data from cpu_memory buffer - */ - void copyFrom(const real* hostSrc, size_t size); - - void copyFrom(const real* hostSrc, const int64_t* seq); - - void copyFrom(const Matrix& src, hl_stream_t stream); - - void copyFrom(const Matrix& src); - - void copyFrom(const IVector& src); - - void copyByRowIndex(Matrix& b, const IVector& rowIndex); - - MatrixPtr clone(size_t height, size_t width, bool useGpu = false); - - real getElement(size_t x, size_t y) const; - - real* getRow(size_t row) { return BaseMatrix::rowBuf(row); } - virtual real* getRowBuf(size_t row) { return getRow(row); } - - real getSum(); - void accumulateColSum(Matrix& src); - real getAbsSum(); - - MatrixPtr getTranspose(); - void transpose(MatrixPtr matTrans, bool memAlloc); - - MatrixPtr getInverse(); - void inverse(MatrixPtr matInv, bool memAlloc); - - /// add b to each sample of this. - void addBias(Matrix& b, real scale); - void addSharedBias(Matrix& b, real scale); - - /** - * @code - * add each sample from a to this. 
- * @endcode - */ - void collectBias(Matrix& a, real scale); - void collectSharedBias(Matrix& a, real scale); - - void sequenceAvgForward(Matrix& a, const IVector& startsPos, int mode); - - /** - * @code - * this.row[i] += table.row[ids[i]] - * @endcode - */ - virtual void selectRows(Matrix& table, IVector& ids); - - /** - * @code - * this[i] = table[i, id[i]] - * @endcode - */ - virtual void selectElements(Matrix& table, IVector& ids); - - /** - * @code - * table.row[ids[i]] += this.row[i] - * @endcode - */ - virtual void addToRows(Matrix& table, IVector& ids); - - void addColumnVector(const Matrix& b); - - /** - * @code - * this = scaleAB*(a*b) + scaleT*this - * @endcode - */ - void mul(const Matrix& a, const Matrix& b, real scaleAB, real scaleT); - - /** - * @code - * this = a*b - * @endcode - */ - void mul(const Matrix& a, const Matrix& b); - - void mul(const GpuMatrix& a, const GpuMatrix& b, real scaleAB, real scaleT); - - void mul(const GpuSparseMatrix& a, - const GpuMatrix& b, - real scaleAB, - real scaleT); - - void mul(const GpuMatrix& a, - const GpuSparseMatrix& b, - real scaleAB, - real scaleT); - - /** - * @code - * this = scaleAB*(this*b) + scaleT*this - * @endcode - */ - void rightMul(Matrix& b, real scaleAB, real scaleT); - - /** - * @code - * this = this* b - * @endcode - */ - void rightMul(Matrix& b); - - /** - * @code - * this = scaleAB*(a*this) + scaleT*this - * @endcode - */ - void leftMul(Matrix& a, real scaleAB, real scaleT); - - /** - * @code - * this = a*this - * @endcode - */ - void leftMul(Matrix& a); - - void colMerge(Matrix& src); - void rowSum(Matrix& sum); - void rowMax(Matrix& max); - void rowMax(IVector& maxIds, Matrix& max); - void colMax(Matrix& max); - void colMax(IVector& maxIds, Matrix& max); - void maxoutForward(Matrix& a, IVector& id, size_t channels, size_t groups); - void maxoutBackward(Matrix& a, IVector& id, size_t channels, size_t groups); - - void oneHotCrossEntropy(Matrix& output, IVector& label); - void oneHotCrossEntropyBp(Matrix& outputV, IVector& label); - void oneHotCrossEntropyWithSelfNorm(Matrix& output, - IVector& label, - real alpha); - void oneHotCrossEntropyWithSelfNormBp(Matrix& outputV, - IVector& label, - real alpha); - - void softmax(Matrix& output); - void sequenceSoftmax(Matrix& output, const IVector& index); - void softmaxBackward(Matrix& outputV); - void softmaxDerivative(Matrix& output, Matrix& sftmaxSum); - - /// calculate the sum of squares diff cost. - void sumOfSquares(Matrix& output, Matrix& label); - - /// gradient of sumOfSquares. 
- void sumOfSquaresBp(Matrix& outputV, Matrix& label); - void tanh(Matrix& output); - void tanhDerivative(Matrix& output); - void softrelu(Matrix& output); - void softreluDerivative(Matrix& output); - void scaledTanh(Matrix& output, real p1, real p2); - - void cosSim(Matrix& output1, Matrix& output2, real scale); - void cosSimDerivative(Matrix& output, - Matrix& prevOut1, - Matrix& prevOut2, - Matrix& prevGrad1, - Matrix& prevGrad2, - real scale); - - virtual void print(std::ostream& os) const; - virtual void print(std::ostream& os, size_t height, size_t width) const; - - void paramReluForward(Matrix& data, Matrix& W); - void paramReluBackwardW(Matrix& oGrad, Matrix& data); - void paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W); - - void check(std::ostream& os, Matrix& refMat, bool printDiff = true); - void randomizeUniform(); - - void classificationError(Matrix& output, IVector& label); - - void convExpand(Matrix& feature, - int feaImgHeight, - int feaImgWidth, - int channels, - int blockH, - int blockW, - int strideH, - int strideW, - int paddingH, - int paddingW, - int outputH, - int outputW); - - void convShrink(Matrix& expandColMat, - int thisImgHeight, - int thisImgWidth, - int channels, - int blockH, - int blochW, - int strideH, - int strideW, - int paddingH, - int paddingWreal, - int outputH, - int outputW, - real alpha = 1.0f, - real beta = 0.0f); - - void maxPoolForward(Matrix& inputMat, - size_t imgSizeH, - size_t imgSizeW, - size_t channels, - size_t sizeX, - size_t sizeY, - size_t strideH, - size_t strideW, - size_t outputH, - size_t outputW, - size_t paddingH, - size_t paddingW); - - void maxPoolBackward(Matrix& image, - size_t imgSizeH, - size_t imgSizeW, - Matrix& outGrad, - Matrix& outV, - size_t sizeX, - size_t sizeY, - size_t strideH, - size_t strideW, - size_t outputH, - size_t outputW, - real scaleTargets, - real scaleOutput, - size_t paddingH, - size_t paddingW); - - void avgPoolForward(Matrix& input, - size_t imgSizeH, - size_t imgSizeW, - size_t channels, - size_t sizeX, - size_t sizeY, - size_t strideH, - size_t strideW, - size_t outputH, - size_t outputW, - size_t paddingH, - size_t paddingW); - - void avgPoolBackward(Matrix& input, - size_t imgSizeH, - size_t imgSizeW, - size_t sizeX, - size_t sizeY, - size_t strideH, - size_t strideW, - size_t outputH, - size_t outputW, - real scaleTargets, - real scaleOutput, - size_t paddingH, - size_t paddingW); - - void maxSequenceForward(Matrix& input, - const IVector& sequence, - IVector& index); - - void maxSequenceBackward(Matrix& outputGrad, - const IVector& sequence, - IVector& index); - - void bilinearForward(const Matrix& in, - const size_t inImgH, - const size_t inImgW, - const size_t outImgH, - const size_t outImgW, - const size_t numChannels, - const real ratioH, - const real ratioW); - - void bilinearBackward(const Matrix& out, - const size_t outImgH, - const size_t outImgW, - const size_t inImgH, - const size_t inImgW, - const size_t numChannels, - const real ratioH, - const real ratioW); - - void multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label); - - void multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label); - - template - void operator=(const ExpressionType& expr) { - TensorGpuApply(*this, expr); - } -}; - -class CpuMatrix : public Matrix { -public: - CpuMatrix(size_t height, size_t width, bool trans = false); - CpuMatrix(real* data, size_t height, size_t width, bool trans = false) - : Matrix(data, height, width, trans, false) {} - CpuMatrix(real* data, - size_t height, - 
size_t width, - size_t stride, - bool trans = false) - : Matrix(data, height, width, stride, trans, false) {} - - CpuMatrix(CpuMemHandlePtr dataHandle, - size_t height, - size_t width, - bool trans = false) - : Matrix(dataHandle, height, width, trans, false) {} - - ~CpuMatrix(); - - void zeroMem(); - void resetOne(); - void setDiag(real value); - - void resize(size_t newHeight, size_t newWidth); - void resize(size_t newHeight, - size_t newWidth, - size_t newNnz, /* used to allocate space */ - SparseValueType valueType, - SparseFormat format) { - LOG(FATAL) << "Only Support Sparse Matrix"; - } - void setRow(size_t row, - size_t colNum, - const unsigned int* cols, - const real* values) { - LOG(FATAL) << "Only Support Sparse Matrix"; - } - - real getElement(size_t x, size_t y) const; - real getSum(); - void accumulateColSum(Matrix& src); - real getAbsSum(); - - MatrixPtr getTranspose(); - void transpose(MatrixPtr matTrans, bool memAlloc); - - MatrixPtr getInverse(); - void inverse(MatrixPtr matInv, bool memAlloc); - - void copyFrom(const Matrix& src); - - void copyFrom(const Matrix& src, hl_stream_t stream); - - void copyFrom(const real* cpuSrc, size_t size); - - void copyFrom(const real* cpuSrc, const int64_t* seq); - - void copyFrom(const IVector& src); - - void copyFrom(CpuSparseMatrix& src); - - void copyByRowIndex(Matrix& b, const IVector& rowIndex); - - MatrixPtr clone(size_t height, size_t width, bool useGpu = false); - - void convExpand(Matrix& feature, - int feaImgHeight, - int feaImgWidth, - int channels, - int blcokH, - int blockW, - int strideH, - int strideW, - int paddingH, - int paddingW, - int outputH, - int outputW); - - void convShrink(Matrix& expandFeat, - int thisImgHeight, - int thisImgWidth, - int channels, - int blockH, - int blockW, - int strideH, - int strideW, - int paddingH, - int paddingW, - int outputH, - int outputW, - real alpha = 1.0f, - real beta = 0.0f); - - void maxPoolForward(Matrix& inputMat, - size_t imgSizeH, - size_t imgSizeW, - size_t channels, - size_t sizeX, - size_t sizeY, - size_t strideH, - size_t strideW, - size_t outputH, - size_t outputW, - size_t paddingH, - size_t paddingW); - - void maxPoolBackward(Matrix& image, - size_t imgSizeH, - size_t imgSizeW, - Matrix& outGrad, - Matrix& outV, - size_t sizeX, - size_t sizeY, - size_t strideH, - size_t strideW, - size_t outputH, - size_t outputW, - real scaleTargets, - real scaleOutput, - size_t paddingH, - size_t paddingW); - - void avgPoolForward(Matrix& input, - size_t imgSizeH, - size_t imgSizeW, - size_t channels, - size_t sizeX, - size_t sizeY, - size_t strideH, - size_t strideW, - size_t outputH, - size_t outputW, - size_t paddingH, - size_t paddingW); - - void avgPoolBackward(Matrix& input, - size_t imgSizeH, - size_t imgSizeW, - size_t sizeX, - size_t sizeY, - size_t strideH, - size_t strideW, - size_t outputH, - size_t outputW, - real scaleTargets, - real scaleOutput, - size_t paddingH, - size_t paddingW); - - void maxSequenceForward(Matrix& input, - const IVector& sequence, - IVector& index); - - void maxSequenceBackward(Matrix& outputGrad, - const IVector& sequence, - IVector& index); - - real* getRow(size_t row) { return BaseMatrix::rowBuf(row); } - virtual real* getRowBuf(size_t row) { return getRow(row); } - -public: - /// add b to each sample of this. - void addBias(Matrix& b, real scale); - void addSharedBias(Matrix& b, real scale); - - /// add each sample of a to this. 
- void collectBias(Matrix& a, real scale); - void collectSharedBias(Matrix& a, real scale); - - void sequenceAvgForward(Matrix& a, const IVector& startsPos, int mode); - - /** - * @code - * this.row[i] += table.row[ids[i]] - * @endcode - */ - virtual void selectRows(Matrix& table, IVector& ids); - - /** - * @code - * table.row[ids[i]] += this.row[i] - * @endcode - */ - virtual void addToRows(Matrix& table, IVector& ids); - - /** - * @code - * this[i] = table[i, id[i]] - * @endcode - */ - virtual void selectElements(Matrix& table, IVector& ids); - - /** - * @code - * table[i, id[i]] += this[i] - * @endcode - */ - virtual void addElements(Matrix& table, IVector& ids); - - /** - * use abstract getRow() to get row from table. - * - * Define table as template instead of virtual class for performance sake. - * internal used by above two virtual funcs. - */ - template - void selectRowsImp(TableMatType& table, IVector& ids); - template - void addToRowsImp(TableMatType& table, IVector& ids); - - void addColumnVector(const Matrix& b); - - void mul(const Matrix& a, const Matrix& b, real scaleAB, real scaleT); - void mul(CpuMatrix* a, CpuMatrix* b, real scaleAB, real scaleT); - - void mul(CpuMatrix* a, CpuSparseMatrix* b, real scaleAB, real scaleT); - - static void mul(CpuMatrix* a, - CpuMatrix* b, - CpuSparseMatrix* c, - real scaleAB, - real scaleT); - - /** - * c = a * b - * - * use abstract getRow() to get row from B,C. - * Define B,C as template instead of virtual class for performance sake. - */ - template - static void mul( - CpuSparseMatrix* a, MatBType* b, MatCType* c, real scaleAB, real scaleT); - - virtual void mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, real scaleT); - - void mul(const Matrix& a, const Matrix& b); - - void rightMul(Matrix& b, real scaleAB, real scaleT); - void rightMul(Matrix& b); - - void leftMul(Matrix& a, real scaleAB, real scaleT); - void leftMul(Matrix& a); - void colMerge(Matrix& src); - void rowSum(Matrix& sum); - void rowMaxId(IVector& maxIds); - void rowMax(Matrix& max); - void rowMax(IVector& maxIds, Matrix& maxVal); - void colMax(Matrix& max); - void colMax(IVector& maxIds, Matrix& maxVal); - void maxoutForward(Matrix& a, IVector& id, size_t channels, size_t groups); - void maxoutBackward(Matrix& a, IVector& id, size_t channels, size_t groups); - void rowNormalizeL1(Matrix& out); - - void oneHotCrossEntropy(Matrix& output, IVector& label); - void oneHotCrossEntropyBp(Matrix& outputV, IVector& label); - void oneHotCrossEntropyWithSelfNorm(Matrix& output, - IVector& label, - real alpha); - void oneHotCrossEntropyWithSelfNormBp(Matrix& outputV, - IVector& label, - real alpha); - - void circularConv(Matrix& b, Matrix& c); - void circularConvDerivative(Matrix& output, - Matrix& prevOut1, - Matrix& prevOut2, - Matrix& prevGrad1, - Matrix& prevGrad2); - - void softmax(Matrix& output); - void sequenceSoftmax(Matrix& output, const IVector& index); - void softmaxDerivative(Matrix& output, Matrix& sftmaxSum); - - /// calculate the sum of squares diff cost. - void sumOfSquares(Matrix& output, Matrix& label); - - /// gradient of sumOfSquares. 
- void sumOfSquaresBp(Matrix& outputV, Matrix& label); - - void tanh(Matrix& output); - void tanhDerivative(Matrix& output); - - void softrelu(Matrix& output); - void softreluDerivative(Matrix& output); - void scaledTanh(Matrix& output, real p1, real p2); - - void cosSim(Matrix& output1, Matrix& output2, real scale); - void cosSimDerivative(Matrix& output, - Matrix& prevOut1, - Matrix& prevOut2, - Matrix& prevGrad1, - Matrix& prevGrad2, - real scale); - - void print(std::ostream& os) const; - void print(std::ostream& os, size_t height, size_t width) const; - void printOneRow(std::ostream& os, size_t idx) const; - - void paramReluForward(Matrix& data, Matrix& W); - void paramReluBackwardW(Matrix& oGrad, Matrix& data); - void paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W); - - void check(std::ostream& os, Matrix& refMat, bool printDiff = true); - - real getMin(); - real getMax(); - - void randomizeUniform(); - - void classificationError(Matrix& output, IVector& label); - - void addByBitCode(size_t numClasses, const IVector& codes, const Matrix& vec); - - void addByBitCodeBackward(size_t numClasses, - const IVector& codes, - Matrix& vec); - - void mulByBitCode(size_t numClasses, - const IVector& codes, - const Matrix& mat, - const Matrix& input); - - void mulByBitCodeBackwardWeight(size_t numClasses, - const IVector& codes, - Matrix& mat, - const Matrix& input); - - void mulByBitCodeBackwardError(size_t numClasses, - const IVector& codes, - const Matrix& mat, - Matrix& input); - - void sumByBitCode(size_t numClasses, - IVector& codes, - Matrix& sum, - real scaleSum); - - void subByBitCode(size_t numClasses_, IVector& codes); - - void multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label); - void multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label); - void classificationErrorMulti(Matrix& output, Matrix& label, real threshold); - - void bilinearForward(const Matrix& in, - const size_t inImgH, - const size_t inImgW, - const size_t outImgH, - const size_t outImgW, - const size_t numChannels, - const real ratioH, - const real ratioW); - - void bilinearBackward(const Matrix& out, - const size_t outImgH, - const size_t outImgW, - const size_t inImgH, - const size_t inImgW, - const size_t numChannels, - const real ratioH, - const real ratioW); - - template - void operator=(const ExpressionType& expr) { - TensorCpuApply(*this, expr); - } -}; - -class SharedCpuMatrix : public CpuMatrix { -public: - /* blockNum is number of partitions of the matrix */ - SharedCpuMatrix(int blockNum, size_t height, size_t width, bool trans = false) - : CpuMatrix(height, width, trans) { - initShared(blockNum); - } - SharedCpuMatrix( - int blockNum, real* data, size_t height, size_t width, bool trans = false) - : CpuMatrix(data, height, width, trans) { - initShared(blockNum); - } - - SharedCpuMatrix(int blockNum, - CpuMemHandlePtr dataHandle, - size_t height, - size_t width, - bool trans = false) - : CpuMatrix(dataHandle, height, width, trans) { - initShared(blockNum); - } - - SharedCpuMatrix(CpuMemHandlePtr dataHandle, - size_t height, - size_t width, - bool trans = false) - : CpuMatrix(dataHandle, height, width, trans) { - initBlock(1); - } - - ~SharedCpuMatrix() {} - -public: - virtual void mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, real scaleT); - virtual void add(Matrix& b, real p1, real p2); - virtual void add(real p1, real p2); - -private: - using Matrix::mul; - void initShared(int blockNum); - void initBlock(int blockNum); - - int blockNum_; - std::vector> blockLocks_; - 
ThreadLocal<CpuMatrixPtr> localBuf_;
-  ThreadLocal<std::vector<int>> localBufRows_;
-  ThreadLocal<std::vector<int>> blockSeq_;
-};
-
-typedef struct { unsigned int col; } sparse_non_value_t;
-
-typedef struct {
-  unsigned int col;
-  float value;
-} sparse_float_value_t;
-
-}  // namespace paddle
-#include "ExecViaCpu.h"
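To make the interface change in Function.h concrete, here is a hedged sketch of a FunctionBase subclass under the new two-argument calc and the output ArgType contract. ScaleFunc, the "scale" config key, and the element-wise operation are invented for illustration; only the FunctionBase and BufferArg interfaces come from the headers above:

class ScaleFunc : public FunctionBase {
public:
  void init(const FuncConfig& config) override {
    scale_ = config.get<real>("scale");  // assumes FuncConfig::get<T>(key)
  }

  // The two-argument calc replaces the old (inputs, outputs, inouts) form;
  // how the output buffer is written is now driven by its ArgType.
  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
    CHECK_EQ(inputs.size(), 1UL);
    CHECK_EQ(outputs.size(), 1UL);
    const auto in = inputs[0].matrix<DEVICE_TYPE_CPU>();
    auto out = outputs[0].matrix<DEVICE_TYPE_CPU>();
    CHECK_EQ(in.getElementCnt(), out.getElementCnt());

    const real* src = in.getData();
    real* dst = out.getData();
    const bool addTo = (outputs[0].getArgType() == BufferArg::ADD_TO);
    for (size_t i = 0; i < in.getElementCnt(); ++i) {
      // ADD_TO accumulates into the output buffer; ASSIGN_TO overwrites it.
      dst[i] = (addTo ? dst[i] : real(0)) + scale_ * src[i];
    }
  }

private:
  real scale_;
};

A real Function would additionally be templated on DeviceType, as FunctionTest.cpp does, and registered through the funcRegistrar_ declared in Function.h.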