diff --git a/paddle/function/BufferArg.h b/paddle/function/BufferArg.h
index 440a924a7a63d88d9ffd8cd357cdd321d3234270..84209265ce7634121e3e4dde609cd787093c45ec 100644
--- a/paddle/function/BufferArg.h
+++ b/paddle/function/BufferArg.h
@@ -40,7 +40,6 @@ enum SparseDataFormat { SPARSE_CSR_FORMAT = 0, SPARSE_CSC_FORMAT = 1 };
 class BufferArg;
 class SequenceArg;
 class SparseMatrixArg;
-typedef std::shared_ptr<BufferArg> BufferArgPtr;
 
 /**
  * \brief BufferArg used as the argument type of Function.
@@ -51,6 +50,11 @@ typedef std::shared_ptr<BufferArg> BufferArgPtr;
  * 3. SequenceArg for a Buffer of sequence data.
  * 4. SparseMatrixArg for a Buffer of sparse matrix.
  *
+ * Buffer shape
+ * For most buffers, the first dimension `shape()[0]` represents
+ * the size of the mini-batch.
+ *
+ * Buffer argType
  * There is an ArgType property for the BufferArg used as Function Output.
  * Whether the result of the Function calculation is assigned to the
  * output Buffer or added to the output Buffer is determined by the
@@ -72,6 +76,14 @@ public:
   ArgType getArgType() const { return argType_; }
 
 public:
+  BufferArg(ValueType valueType,
+            const TensorShape& shape,
+            ArgType argType = UNSPECIFIED)
+      : buf_(nullptr),
+        valueType_(valueType),
+        shape_(shape),
+        argType_(argType) {}
+
   BufferArg(void* buf,
             ValueType valueType,
             const TensorShape& shape,
@@ -177,6 +189,13 @@ protected:
 // if a < b then value_.buf_[a] < value_.buf_[b]
 class SequenceIdArg : public BufferArg {
 public:
+  SequenceIdArg(const TensorShape& shape, ArgType argType = UNSPECIFIED)
+      : BufferArg(VALUE_TYPE_INT32, shape, argType) {
+    CHECK_EQ(shape_.ndims(), (size_t)1);
+    CHECK_GT(shape_[0], 1);
+    numSeqs_ = shape_[0] - 1;
+  }
+
   SequenceIdArg(void* buf,
                 const TensorShape& shape,
                 ArgType argType = UNSPECIFIED)
@@ -199,9 +218,18 @@ private:
   size_t numSeqs_;
 };
 
-// sequence data {seqId(vec), buf(matrix)}
+// sequences data
+// For mini-batch calculate,
+// one batch can contain more than one sequence of data.
+// SequenceArg can be used to represent sequences that contain multiple
+// unequal lengths.
 class SequenceArg : public BufferArg {
 public:
+  SequenceArg(ValueType valueType,
+              const TensorShape& shape,
+              ArgType argType = UNSPECIFIED)
+      : BufferArg(valueType, shape, argType), startPositions_(TensorShape()) {}
+
   SequenceArg(void* buf,
               ValueType valueType,
               const TensorShape& shape,
@@ -223,7 +251,8 @@ public:
 
   void* getIdBuf() const { return startPositions_.data(); }
   size_t numSeqs() const { return startPositions_.numSeqs(); }
-  const SequenceIdArg& getSequenceIds() const { return startPositions_; }
+  SequenceIdArg& getSequenceId() { return startPositions_; }
+  const SequenceIdArg& getSequenceId() const { return startPositions_; }
 
 private:
   SequenceIdArg startPositions_;
diff --git a/paddle/function/BufferArgTest.cpp b/paddle/function/BufferArgTest.cpp
index b345597435c9911ce95b596f5f7f2add47f4cd03..1744f377808f137dcda4a28acce336dc22be3d01 100644
--- a/paddle/function/BufferArgTest.cpp
+++ b/paddle/function/BufferArgTest.cpp
@@ -14,9 +14,7 @@ limitations under the License. */
 
 #include "BufferArg.h"
 #include <gtest/gtest.h>
-#include "Function.h"
 #include "paddle/math/MemoryHandle.h"
-#include "paddle/math/SparseMatrix.h"
 
 namespace paddle {
 
@@ -37,55 +35,4 @@ TEST(BufferTest, SequenceIdArg) {
   EXPECT_EQ(buffer.numSeqs(), 9);
 }
 
-TEST(BufferTest, asArgument) {
-  MatrixPtr matrix = Matrix::create(100, 200);
-  VectorPtr vector = Vector::create(100, false);
-  CpuSparseMatrix sparse(200, 300, 50);
-
-  // prepare arguments
-  BufferArgs argments;
-  argments.addArg(*matrix);
-  argments.addArg(*vector);
-  argments.addArg(sparse);
-
-  // function
-  auto function = [=](const BufferArgs& inputs) {
-    EXPECT_EQ(inputs.size(), 3);
-
-    // check inputs[0]
-    EXPECT_EQ(inputs[0].shape().ndims(), 2);
-    EXPECT_EQ(inputs[0].shape()[0], 100);
-    EXPECT_EQ(inputs[0].shape()[1], 200);
-    EXPECT_EQ(inputs[0].data(), matrix->getData());
-
-    EXPECT_EQ(inputs[0].matrix<DEVICE_TYPE_CPU>().getHeight(),
-              matrix->getHeight());
-    EXPECT_EQ(inputs[0].matrix<DEVICE_TYPE_CPU>().getWidth(),
-              matrix->getWidth());
-    EXPECT_EQ(inputs[0].matrix<DEVICE_TYPE_CPU>().getData(), matrix->getData());
-
-    // check inputs[1]
-    EXPECT_EQ(inputs[1].shape().ndims(), 1);
-    EXPECT_EQ(inputs[1].shape()[0], 100);
-    EXPECT_EQ(inputs[1].data(), vector->getData());
-    CpuVector inVector = inputs[1].vector<real, DEVICE_TYPE_CPU>();
-    EXPECT_EQ(inVector.getSize(), vector->getSize());
-    EXPECT_EQ(inVector.getData(), vector->getData());
-
-    // check inputs[2]
-    EXPECT_EQ(inputs[2].shape().ndims(), 2);
-    EXPECT_EQ(inputs[2].shape()[0], 200);
-    EXPECT_EQ(inputs[2].shape()[1], 300);
-    EXPECT_EQ(inputs[2].data(), sparse.getData());
-    // CHECK_EQ(inputs[2].sparse().nnz(), 50);
-    // CHECK_EQ(inputs[2].sparse().dataFormat(), SPARSE_CSR_FORMAT);
-    // CHECK_EQ(inputs[2].sparse().dataType(), SPARSE_FLOAT_VALUE);
-    EXPECT_EQ(inputs[2].sparse().getRowBuf(), sparse.getRows());
-    EXPECT_EQ(inputs[2].sparse().getColBuf(), sparse.getCols());
-  };
-
-  // call function
-  function(argments);
-}
-
 }  // namespace paddle
diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt
index 39733479cc55d8d5e88bbe1ddf9fc4a08f3f8f71..a5cf16cb568ee9bafd15a8c9737d933b6fbbd12b 100644
--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
@@ -19,7 +19,7 @@ if(WITH_TESTING)
     # TODO:
     # file(GLOB test_files . *OpTest.cpp)
     # add_executable(${test_bin} EXCLUDE_FROM_ALL ${test_files})
-    # add_simple_unittest(CrossMapNormalOpTest)
+    add_simple_unittest(CrossMapNormalOpTest)
     add_simple_unittest(TensorShapeTest)
     add_simple_unittest(TensorTypeTest)
     add_simple_unittest(BufferArgTest)
diff --git a/paddle/function/ContextProjectionOp.cpp b/paddle/function/ContextProjectionOp.cpp
index 2ef53cd6d9dedac38d5c9bff9f9bbaaf597cf6ab..6cd4e4abee8fccf3a4745b0bfc6701df4ddfa5c0 100644
--- a/paddle/function/ContextProjectionOp.cpp
+++ b/paddle/function/ContextProjectionOp.cpp
@@ -115,11 +115,10 @@ public:
     const auto val_seqs = dynamic_cast<const SequenceArg&>(inputs[0]);
     auto out_seq = dynamic_cast<const SequenceArg&>(outputs[0]);
 
-    CHECK(out_seq.data() && val_seqs.data() &&
-          val_seqs.getSequenceIds().data());
+    CHECK(out_seq.data() && val_seqs.data() && val_seqs.getSequenceId().data());
     CHECK_EQ(out_seq.shape().ndims(), (size_t)2);
     CHECK_EQ(val_seqs.shape().ndims(), (size_t)2);
-    CHECK_EQ(val_seqs.getSequenceIds().shape().ndims(), (size_t)1);
+    CHECK_EQ(val_seqs.getSequenceId().shape().ndims(), (size_t)1);
     if (2 == inputs.size()) {
       CHECK_EQ(inputs[1].shape().ndims(), (size_t)2);
     }
@@ -139,7 +138,7 @@ public:
         (2 == inputs.size())
             ? inputs[1].matrix<Device>()
             : typename Tensor<real, Device>::Matrix(nullptr, 0, 0);
-    const auto seq_vec = val_seqs.getSequenceIds().vector<int, Device>();
+    const auto seq_vec = val_seqs.getSequenceId().vector<int, Device>();
     ContextProjectionForward<Device>(out_mat,
                                      in_mat,
                                      w_mat,
@@ -242,11 +241,11 @@ public:
         << "SequenceArg required here";
     const auto in_seq = dynamic_cast<const SequenceArg&>(inputs[0]);
     auto out_seq = dynamic_cast<const SequenceArg&>(outputs[0]);
-    CHECK(in_seq.data() && in_seq.getSequenceIds().data());
+    CHECK(in_seq.data() && in_seq.getSequenceId().data());
     CHECK_EQ(in_seq.shape().ndims(), (size_t)2);
-    CHECK_EQ(in_seq.getSequenceIds().shape().ndims(), (size_t)1);
+    CHECK_EQ(in_seq.getSequenceId().shape().ndims(), (size_t)1);
     CHECK_EQ(out_seq.shape().ndims(), (size_t)2);
-    CHECK_EQ(out_seq.getSequenceIds().shape().ndims(), (size_t)1);
+    CHECK_EQ(out_seq.getSequenceId().shape().ndims(), (size_t)1);
     CHECK_EQ(outputs[1].shape().ndims(), (size_t)2);
 
     /// dim of input grad == dim of weight
@@ -258,7 +257,7 @@ public:
     CHECK_EQ(out_seq.getArgType(), ADD_TO);
     CHECK_EQ(outputs[1].getArgType(), ADD_TO);
 
-    const auto seq_vec = in_seq.getSequenceIds().vector<int, Device>();
+    const auto seq_vec = in_seq.getSequenceId().vector<int, Device>();
     const auto out_grad_mat = in_seq.matrix<Device>();
     auto in_grad_mat =
         !out_seq.data() ? typename Tensor<real, Device>::Matrix(nullptr, 0, 0)
@@ -312,10 +311,10 @@ public:
     const auto in_seq = dynamic_cast<const SequenceArg&>(inputs[0]);
     const auto out_seq = dynamic_cast<const SequenceArg&>(outputs[0]);
 
-    CHECK(in_seq.data() && out_seq.data() && in_seq.getSequenceIds().data());
+    CHECK(in_seq.data() && out_seq.data() && in_seq.getSequenceId().data());
     CHECK_EQ(static_cast<int>(out_seq.shape().ndims()), 2);
     CHECK_EQ(static_cast<int>(in_seq.shape().ndims()), 2);
-    CHECK_EQ(static_cast<int>(in_seq.getSequenceIds().shape().ndims()), 1);
+    CHECK_EQ(static_cast<int>(in_seq.getSequenceId().shape().ndims()), 1);
     /// output layer grad dim == input layer grad dim * context_length_
     CHECK_EQ(in_seq.shape().ndims(), out_seq.shape().ndims() * context_length_);
     /// input and output has the same batch_size
@@ -323,7 +322,7 @@ public:
     CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
 
     const auto out_grad_mat = in_seq.matrix<Device>();
-    const auto seq_vec = in_seq.getSequenceIds().vector<int, Device>();
+    const auto seq_vec = in_seq.getSequenceId().vector<int, Device>();
     auto in_grad_mat = out_seq.matrix<Device>();
 
     ContextProjectionBackwardData<Device>(
@@ -360,16 +359,16 @@ public:
     CHECK_EQ(1, static_cast<int>(outputs.size()));
     CHECK(inputs[0].isSequenceArg()) << "SequenceArg required here";
     const auto in_seq = dynamic_cast<const SequenceArg&>(inputs[0]);
-    CHECK(in_seq.data() && in_seq.getSequenceIds().data() && outputs[0].data());
+    CHECK(in_seq.data() && in_seq.getSequenceId().data() && outputs[0].data());
     CHECK_EQ(static_cast<int>(outputs[0].shape().ndims()), 2);
     CHECK_EQ(static_cast<int>(in_seq.shape().ndims()), 2);
-    CHECK_EQ(static_cast<int>(in_seq.getSequenceIds().shape().ndims()), 1);
+    CHECK_EQ(static_cast<int>(in_seq.getSequenceId().shape().ndims()), 1);
     CHECK_EQ(in_seq.shape()[0], outputs[0].shape()[0]);
     /// output layer grad dim == weight dim * context_length_
     CHECK_EQ(in_seq.shape()[1], outputs[0].shape()[1] * context_length_);
     CHECK_EQ(outputs[0].getArgType(), ADD_TO);
 
-    const auto seq_vec = in_seq.getSequenceIds().vector<int, Device>();
+    const auto seq_vec = in_seq.getSequenceId().vector<int, Device>();
     const auto out_grad_mat = in_seq.matrix<Device>();
     auto w_grad_mat = outputs[0].matrix<Device>();
     ContextProjectionBackwardWeight<Device>(out_grad_mat,
diff --git a/paddle/function/CrossMapNormalOp.cpp b/paddle/function/CrossMapNormalOp.cpp
index 92980c503fdaaaa9ac600070197dba6ba4bfb7a4..8e7dc72524a7680a03ea6eb4770a3e25c09ad913 100644
--- a/paddle/function/CrossMapNormalOp.cpp
+++ b/paddle/function/CrossMapNormalOp.cpp
@@ -188,8 +188,13 @@ public:
     CHECK(inputs[0].shape() == inputs[3].shape());
     CHECK(inputs[0].shape() == outputs[0].shape());
 
-    // TODO(hedaoyuan): need support ASSIGN_TO mode.
-    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+    if (outputs[0].getArgType() != ADD_TO) {
+      // Currently, some algorithm implementations are ASSIGN_TO mode,
+      // if need to support the ADD_TO calculation, need to clear the output.
+      typename Tensor<real, Device>::Vector tmp(
+          outputs[0].shape().getElements(), outputs[0].data<real>());
+      tmp.zero();
+    }
 
     size_t samples = inputs[0].shape()[0];
     size_t channels = inputs[0].shape()[1];
diff --git a/paddle/function/CrossMapNormalOpTest.cpp b/paddle/function/CrossMapNormalOpTest.cpp
index d65d9310affd7c9b7fee3118c79449870849c243..51f5da81bfc9ae870ac9949ba74da01a9449a04d 100644
--- a/paddle/function/CrossMapNormalOpTest.cpp
+++ b/paddle/function/CrossMapNormalOpTest.cpp
@@ -27,15 +27,19 @@ TEST(CrossMapNormal, real) {
                     << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW
                     << " size=" << size;
 
-            FunctionCompare compare("CrossMapNormal",
-                                    FuncConfig()
-                                        .set("size", size)
-                                        .set("scale", (real)1.5)
-                                        .set("pow", (real)0.5));
-            Dims dims{numSamples, channels, imgSizeH, imgSizeW};
-            compare.cmpWithArg({Tensor(nullptr, dims)},
-                               {Tensor(nullptr, dims), Tensor(nullptr, dims)},
-                               {});
+            // init Test object
+            FunctionCompare test("CrossMapNormal",
+                                 FuncConfig()
+                                     .set("size", size)
+                                     .set("scale", (real)1.5)
+                                     .set("pow", (real)0.5));
+            // prepare input arguments
+            TensorShape shape{numSamples, channels, imgSizeH, imgSizeW};
+            test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape));
+            test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, shape));
+            test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, shape));
+            // run Function
+            test.run();
           }
         }
       }
@@ -53,18 +57,19 @@ TEST(CrossMapNormalGrad, real) {
                     << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW
                     << " size=" << size;
 
-            FunctionCompare compare("CrossMapNormalGrad",
-                                    FuncConfig()
-                                        .set("size", size)
-                                        .set("scale", (real)1.5)
-                                        .set("pow", (real)0.5));
-            Dims dims{numSamples, channels, imgSizeH, imgSizeW};
-            compare.cmpWithArg({Tensor(nullptr, dims),
-                                Tensor(nullptr, dims),
-                                Tensor(nullptr, dims),
-                                Tensor(nullptr, dims)},
-                               {Tensor(nullptr, dims)},
-                               {});
+            FunctionCompare test("CrossMapNormalGrad",
+                                 FuncConfig()
+                                     .set("size", size)
+                                     .set("scale", (real)1.5)
+                                     .set("pow", (real)0.5));
+            TensorShape shape{numSamples, channels, imgSizeH, imgSizeW};
+            test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape));
+            test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape));
+            test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape));
+            test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape));
+            test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, shape));
+            // run Function
+            test.run();
           }
         }
       }
diff --git a/paddle/function/Function.cpp b/paddle/function/Function.cpp
index 3b6590846532d8b63cec2b41ca87255c959e036f..f47d55a4ade97d76e0f1940a2234e34e20efade6 100644
--- a/paddle/function/Function.cpp
+++ b/paddle/function/Function.cpp
@@ -79,21 +79,25 @@ FuncConfig& FuncConfig::set<bool>(const std::string& key, bool v) {
 void BufferArgs::addArg(const Matrix& arg,
                         const TensorShape& shape,
                         ArgType argType) {
-  args_.push_back(std::make_shared<BufferArg>(arg, shape, argType));
+  _args_.push_back(new BufferArg(arg, shape, argType));
+  addArg(*_args_.back());
 }
 
 void BufferArgs::addArg(const CpuSparseMatrix& arg, ArgType argType) {
-  args_.push_back(std::make_shared<SparseMatrixArg>(arg, argType));
+  _args_.push_back(new SparseMatrixArg(arg, argType));
+  addArg(*_args_.back());
 }
 
 void BufferArgs::addArg(const GpuSparseMatrix& arg, ArgType argType) {
-  args_.push_back(std::make_shared<SparseMatrixArg>(arg, argType));
+  _args_.push_back(new SparseMatrixArg(arg, argType));
+  addArg(*_args_.back());
 }
 
 void BufferArgs::addArg(const Matrix& matrix,
                         const IVector& vector,
                         ArgType argType) {
-  args_.push_back(std::make_shared<SequenceArg>(matrix, vector, argType));
+  _args_.push_back(new SequenceArg(matrix, vector, argType));
+  addArg(*_args_.back());
 }
 
 ClassRegistrar<FunctionBase> FunctionBase::funcRegistrar_;
diff --git a/paddle/function/Function.h b/paddle/function/Function.h
index c15045143bb67382cfa6fad515555accaa5efb59..9215c137eb8e85a9a03575104d7f89bbce441eba 100644
--- a/paddle/function/Function.h
+++ b/paddle/function/Function.h
@@ -50,19 +50,44 @@ protected:
  * Argument type for Function::calc().
  * A BufferArgs contains a set of BufferArg,
  * because Function can have multiple inputs and outputs.
+ *
+ * addArg() with Matix object used to adapt Layer Argument.
+ * Will create a BufferArg object in addArg(),
+ * and free in destructor of BufferArgs.
+ *
+ * addArg() with BufferArg object, just save BufferArg object address,
+ * and the caller needs to guarantee the validity of the BufferArg object
+ * in the BufferArgs life time.
  */
 class BufferArgs {
 public:
   BufferArgs() {}
+
+  ~BufferArgs() {
+    for (auto arg : _args_) {
+      delete arg;
+    }
+  }
+
   size_t size() const { return args_.size(); }
 
   // add argument into BufferArgs
   // Tensor can be Matrix, Vector, IVector.
   // For inputs, do not need argType.
   // For outputs, the argType needs to be specified as ASSIGN_TO or ADD_TO.
-  template <typename Tensor>
-  void addArg(const Tensor& arg, ArgType argType = UNSPECIFIED) {
-    args_.push_back(std::make_shared<BufferArg>(arg, argType));
+  void addArg(const Matrix& arg, ArgType argType = UNSPECIFIED) {
+    _args_.push_back(new BufferArg(arg, argType));
+    addArg(*_args_.back());
+  }
+
+  void addArg(const Vector& arg, ArgType argType = UNSPECIFIED) {
+    _args_.push_back(new BufferArg(arg, argType));
+    addArg(*_args_.back());
+  }
+
+  void addArg(const IVector& arg, ArgType argType = UNSPECIFIED) {
+    _args_.push_back(new BufferArg(arg, argType));
+    addArg(*_args_.back());
   }
 
   // Add arg into BufferArgs and reshape the arg.
@@ -87,14 +112,27 @@ public:
     return *args_[num];
   }
 
+  void addArg(BufferArg& arg) { args_.push_back(&arg); }
+
+  void addArg(SequenceIdArg& arg) { args_.push_back(&arg); }
+
+  void addArg(SequenceArg& arg) { args_.push_back(&arg); }
+
+  void addArg(SparseMatrixArg& arg) { args_.push_back(&arg); }
+
 private:
-  std::vector<BufferArgPtr> args_;
+  std::vector<BufferArg*> args_;
+  // The BufferArg object is constructed and freed by BufferArgs.
+  std::vector<BufferArg*> _args_;
 };
 
 /**
  * \brief Base class for Function.
  * The basic Function implementation requires override init and calc interfaces.
  *
+ * The caller needs to ensure the validity of the arguments
+ * during Function execution.
+ *
  * Function inputs are readonly, Function outputs have two modes: ASSIGN_TO
  * and ADD_TO.
  * If output.getArgType() == ASSIGN_TO, this is assign mode, and the calculation
diff --git a/paddle/function/FunctionTest.cpp b/paddle/function/FunctionTest.cpp
index 7ce908320a6f6f764e8fdacc96432aca78d7b2df..fdf7e631e5ab8c67eb5cf906bd0af49740d60112 100644
--- a/paddle/function/FunctionTest.cpp
+++ b/paddle/function/FunctionTest.cpp
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include "Function.h"
 #include <gtest/gtest.h>
+#include "paddle/math/SparseMatrix.h"
 
 namespace paddle {
 
@@ -56,4 +57,110 @@ TEST(Function, BufferArgs) {
   Function<DEVICE_TYPE_GPU>(gpuArgments);
 }
 
+/**
+ * Some tests case are used to check the consistency between the BufferArg type
+ * argument received by Function and the original type argument.
+ *
+ * Use Case:
+ *  TEST() {
+ *    Matrix matrix(...);
+ *    CheckBufferArg lambda = [=](const BufferArg& arg) {
+ *      // check matrix and arg are equivalent
+ *      EXPECT_EQ(matrix, arg);
+ *    }
+ *
+ *   BufferArgs argments{matrix...};
+ *   std::vector<CheckBufferArg> checkFunc{lambda...};
+ *   testBufferArgs(argments, checkFunc);
+ *  }
+ */
+typedef std::function<void(const BufferArg&)> CheckBufferArg;
+
+void testBufferArgs(const BufferArgs& inputs,
+                    const std::vector<CheckBufferArg>& check) {
+  EXPECT_EQ(inputs.size(), check.size());
+  for (size_t i = 0; i < inputs.size(); i++) {
+    check[i](inputs[i]);
+  }
+}
+
+void testBufferArgs(const BufferArgs& inputs, const CheckBufferArg& check) {
+  EXPECT_EQ(inputs.size(), 1);
+  check(inputs[0]);
+}
+
+TEST(Arguments, Matrix) {
+  MatrixPtr matrix = Matrix::create(100, 200);
+  CheckBufferArg check = [=](const BufferArg& arg) {
+    EXPECT_EQ(arg.shape().ndims(), 2);
+    EXPECT_EQ(arg.shape()[0], 100);
+    EXPECT_EQ(arg.shape()[1], 200);
+    EXPECT_EQ(arg.data(), matrix->getData());
+
+    EXPECT_EQ(arg.matrix<DEVICE_TYPE_CPU>().getHeight(), matrix->getHeight());
+    EXPECT_EQ(arg.matrix<DEVICE_TYPE_CPU>().getWidth(), matrix->getWidth());
+    EXPECT_EQ(arg.matrix<DEVICE_TYPE_CPU>().getData(), matrix->getData());
+  };
+
+  BufferArgs argments;
+  argments.addArg(*matrix);
+  std::vector<CheckBufferArg> checkFunc;
+  checkFunc.push_back(check);
+  testBufferArgs(argments, checkFunc);
+}
+
+TEST(Arguments, Vector) {
+  VectorPtr vector = Vector::create(100, false);
+  CheckBufferArg check = [=](const BufferArg& arg) {
+    EXPECT_EQ(arg.shape().ndims(), 1);
+    EXPECT_EQ(arg.shape()[0], 100);
+    EXPECT_EQ(arg.data(), vector->getData());
+
+    CpuVector inVector = arg.vector<real, DEVICE_TYPE_CPU>();
+    EXPECT_EQ(inVector.getSize(), vector->getSize());
+    EXPECT_EQ(inVector.getData(), vector->getData());
+  };
+
+  BufferArgs argments;
+  argments.addArg(*vector);
+  std::vector<CheckBufferArg> checkFunc;
+  checkFunc.push_back(check);
+  testBufferArgs(argments, checkFunc);
+}
+
+TEST(Arguments, CpuSparseMatrix) {
+  CpuSparseMatrix sparse(200, 300, 50);
+  CheckBufferArg check = [=](const BufferArg& arg) {
+    EXPECT_EQ(arg.shape().ndims(), 2);
+    EXPECT_EQ(arg.shape()[0], 200);
+    EXPECT_EQ(arg.shape()[1], 300);
+    EXPECT_EQ(arg.data(), sparse.getData());
+    // CHECK_EQ(arg.sparse().nnz(), 50);
+    // CHECK_EQ(arg.sparse().dataFormat(), SPARSE_CSR_FORMAT);
+    // CHECK_EQ(arg.sparse().dataType(), SPARSE_FLOAT_VALUE);
+    EXPECT_EQ(arg.sparse().getRowBuf(), sparse.getRows());
+    EXPECT_EQ(arg.sparse().getColBuf(), sparse.getCols());
+  };
+
+  BufferArgs argments;
+  argments.addArg(sparse);
+  std::vector<CheckBufferArg> checkFunc;
+  checkFunc.push_back(check);
+  testBufferArgs(argments, checkFunc);
+}
+
+TEST(Arguments, BufferArg) {
+  BufferArg arg(nullptr, VALUE_TYPE_FLOAT, {1, 2, 3});
+  CheckBufferArg check = [=](const BufferArg& arg) {
+    EXPECT_EQ(arg.shape().ndims(), 3);
+    EXPECT_EQ(arg.shape()[0], 1);
+    EXPECT_EQ(arg.shape()[1], 2);
+    EXPECT_EQ(arg.shape()[2], 3);
+  };
+
+  BufferArgs argments;
+  argments.addArg(arg);
+  testBufferArgs(argments, check);
+}
+
 }  // namespace paddle
diff --git a/paddle/function/FunctionTest.h b/paddle/function/FunctionTest.h
index da4c0f4f077b8d8cb71190cfb84b600e4d52fa33..24e7a36a43cfa8941535cb778aa1557ec5a0a6f4 100644
--- a/paddle/function/FunctionTest.h
+++ b/paddle/function/FunctionTest.h
@@ -15,57 +15,186 @@ limitations under the License. */
 #include "Function.h"
 #include "paddle/math/Vector.h"
 #include "paddle/math/tests/TensorCheck.h"
+#include "paddle/testing/TestUtil.h"
 
 namespace paddle {
 
+typedef std::shared_ptr<BufferArg> BufferArgPtr;
+
+/**
+ * \brief A class for comparing CPU and GPU implementations of Function.
+ *
+ *
+ * Use case:
+ *  // Initializes a test object, the corresponding cpu and gpu Function
+ *  // are constructed according to FunctionName and FuncConfig.
+ *  FunctionCompare test(FunctionName, FuncConfig);
+ *  // Prepare inputs and outputs arguments.
+ *  // Here the input and output can not contain real data,
+ *  // only contains the argument type and shape.
+ *  test.addInputs(input1);
+ *  test.addInputs(input2);
+ *  test.addOutputs(output1);
+ *  test.addOutputs(output2);
+ *  // Run.
+ *  // Will according to the type and shape of arguments(inputs_/outputs_),
+ *  // automatic initialization cpu and gpu function required arguments
+ *  // (cpuInputs_/cpuOutputs_/gpuInputs_/gpuOutputs_).
+ *  // Call the CPU and GPU Function calculation results.
+ *  // Compares CPU and GPU calculation results for consistency.
+ *  test.run();
+ */
 class FunctionCompare {
 public:
   FunctionCompare(const std::string& name, const FuncConfig& config)
-      : cpu(FunctionBase::funcRegistrar_.createByType(name + "-CPU")),
-        gpu(FunctionBase::funcRegistrar_.createByType(name + "-GPU")) {
-    cpu->init(config);
-    gpu->init(config);
+      : cpuFunc_(FunctionBase::funcRegistrar_.createByType(name + "-CPU")),
+        gpuFunc_(FunctionBase::funcRegistrar_.createByType(name + "-GPU")) {
+    cpuFunc_->init(config);
+    gpuFunc_->init(config);
   }
 
-  void cmpWithArg(const BufferArgs& inputs,
-                  const BufferArgs& outputs,
-                  const BufferArgs& inouts) {
-    // init cpu and gpu arguments
-    auto initArgs = [=](
-        BufferArgs& cpuArgs, BufferArgs& gpuArgs, const BufferArgs& inArgs) {
-      /// leave it empty to pass the compile of ContextProjectionTest
-      /// Daoyuan is working on FunctionTest
-      /// and I will further merge with it
-    };
-    initArgs(cpuInputs, gpuInputs, inputs);
-    initArgs(cpuOutputs, gpuOutputs, outputs);
+  ~FunctionCompare() {}
+
+  // input need only contains shape, do not contains data.
+  void addInputs(const BufferArg& input) {
+    size_t size =
+        input.shape().getElements() * sizeOfValuType(input.valueType());
+    cpuMemory_.emplace_back(std::make_shared<CpuMemoryHandle>(size));
+    gpuMemory_.emplace_back(std::make_shared<GpuMemoryHandle>(size));
+
+    cpuInputs_.emplace_back(std::make_shared<BufferArg>(
+        cpuMemory_.back()->getBuf(), input.valueType(), input.shape()));
+    gpuInputs_.emplace_back(std::make_shared<BufferArg>(
+        gpuMemory_.back()->getBuf(), input.valueType(), input.shape()));
+  }
+
+  // output need only contains shape, do not contains data.
+  void addOutputs(const BufferArg& output) {
+    size_t size =
+        output.shape().getElements() * sizeOfValuType(output.valueType());
+    cpuMemory_.emplace_back(std::make_shared<CpuMemoryHandle>(size));
+    gpuMemory_.emplace_back(std::make_shared<GpuMemoryHandle>(size));
+
+    cpuOutputs_.emplace_back(
+        std::make_shared<BufferArg>(cpuMemory_.back()->getBuf(),
+                                    output.valueType(),
+                                    output.shape(),
+                                    ASSIGN_TO));
+    gpuOutputs_.emplace_back(
+        std::make_shared<BufferArg>(gpuMemory_.back()->getBuf(),
+                                    output.valueType(),
+                                    output.shape(),
+                                    ASSIGN_TO));
+  }
+
+  void addInputs(const SequenceArg& input) {
+    size_t batchSize = input.shape()[0];
+    size_t numSeqs = batchSize / 10 + 1;
+
+    size_t sizeId = (numSeqs + 1) * sizeOfValuType(VALUE_TYPE_INT32);
+    cpuMemory_.emplace_back(std::make_shared<CpuMemoryHandle>(sizeId));
+    gpuMemory_.emplace_back(std::make_shared<GpuMemoryHandle>(sizeId));
+
+    TensorShape seqsId({numSeqs + 1});
+    // void* cpuBuffer = cpuMemory_.back()->getBuf();
+    // void* gpuBuffer = gpuMemory_.back()->getBuf();
+
+    size_t size =
+        input.shape().getElements() * sizeOfValuType(input.valueType());
+    cpuMemory_.emplace_back(std::make_shared<CpuMemoryHandle>(size));
+    gpuMemory_.emplace_back(std::make_shared<GpuMemoryHandle>(size));
+
+    // TODO: need be implemented.
+  }
+
+  void run() {
+    // prepare cpu/gpu arguments
+    initInputs();
 
     // function calculate
-    cpu->calc(cpuInputs, cpuOutputs);
-    gpu->calc(gpuInputs, gpuOutputs);
+    auto callFunction = [](FunctionBase* function,
+                           std::vector<BufferArgPtr>& inputs,
+                           std::vector<BufferArgPtr>& outputs) {
+      BufferArgs inArgs;
+      BufferArgs outArgs;
+      for (auto arg : inputs) {
+        inArgs.addArg(*arg);
+      }
+      for (auto arg : outputs) {
+        outArgs.addArg(*arg);
+      }
+      function->calc(inArgs, outArgs);
+    };
+
+    callFunction(cpuFunc_.get(), cpuInputs_, cpuOutputs_);
+    callFunction(gpuFunc_.get(), gpuInputs_, gpuOutputs_);
 
     // check outputs and inouts
-    auto checkArgs = [=](const BufferArgs& cpuArgs, const BufferArgs& gpuArgs) {
-      /// leave it open
-    };
-    checkArgs(cpuOutputs, gpuOutputs);
+    compareOutputs();
   }
 
-  std::shared_ptr<FunctionBase> getCpuFunction() const { return cpu; }
+  std::shared_ptr<FunctionBase> getCpuFunction() const { return cpuFunc_; }
+
+  std::shared_ptr<FunctionBase> getGpuFunction() const { return gpuFunc_; }
 
-  std::shared_ptr<FunctionBase> getGpuFunction() const { return gpu; }
+protected:
+  void initInputs() {
+    for (size_t i = 0; i < cpuInputs_.size(); i++) {
+      initArg(*cpuInputs_[i]);
+
+      // TODO: Need a BufferCopy used to copy from one BufferArg to another.
+      CpuVector cpuVector(cpuInputs_[i]->shape().getElements(),
+                          (real*)cpuInputs_[i]->data());
+      GpuVector gpuVector(gpuInputs_[i]->shape().getElements(),
+                          (real*)gpuInputs_[i]->data());
+
+      gpuVector.copyFrom(cpuVector);
+    }
+  }
+
+  void compareOutputs() {
+    for (size_t i = 0; i < cpuOutputs_.size(); i++) {
+      // TODO, Need a BufferCheck used to compare the two buffers.
+      auto cpu = cpuOutputs_[i];
+      auto gpu = gpuOutputs_[i];
+      CpuVector cpuVector(cpu->shape().getElements(), (real*)cpu->data());
+      GpuVector gpuVector(cpu->shape().getElements(), (real*)gpu->data());
+
+      autotest::TensorCheckErr(cpuVector, gpuVector);
+    }
+  }
+
+  // only init cpu argument, gpu argument copy from cpu argument.
+  void initArg(BufferArg& arg) {
+    CpuVector vector(arg.shape().getElements(), (real*)arg.data());
+    vector.uniform(0.001, 1);
+  }
+
+  void initArg(SequenceIdArg& arg, size_t batchSize) {
+    size_t numSeqs = arg.numSeqs();
+    int* buf = reinterpret_cast<int*>(arg.data());
+    int pos = 0;
+    size_t maxLen = 2 * batchSize / numSeqs;
+    for (int i = 0; i < (int)numSeqs; ++i) {
+      int len = uniformRandom(
+                    std::min<int64_t>(maxLen, batchSize - pos - numSeqs + i)) +
+                1;
+      buf[i] = pos;
+      pos += len;
+      VLOG(1) << " len=" << len;
+    }
+    buf[numSeqs] = batchSize;
+  }
 
 protected:
-  std::shared_ptr<FunctionBase> cpu;
-  std::shared_ptr<FunctionBase> gpu;
-  std::vector<CpuMemHandlePtr> cpuMemory;
-  std::vector<GpuMemHandlePtr> gpuMemory;
-  BufferArgs cpuInputs;
-  BufferArgs cpuOutputs;
-  BufferArgs cpuInouts;
-  BufferArgs gpuInputs;
-  BufferArgs gpuOutputs;
-  BufferArgs gpuInouts;
+  std::shared_ptr<FunctionBase> cpuFunc_;
+  std::shared_ptr<FunctionBase> gpuFunc_;
+  std::vector<CpuMemHandlePtr> cpuMemory_;
+  std::vector<GpuMemHandlePtr> gpuMemory_;
+  std::vector<BufferArgPtr> cpuInputs_;
+  std::vector<BufferArgPtr> cpuOutputs_;
+  std::vector<BufferArgPtr> gpuInputs_;
+  std::vector<BufferArgPtr> gpuOutputs_;
 };
 
 }  // namespace paddle