diff --git a/paddle/function/BufferArg.h b/paddle/function/BufferArg.h
index 3d28249f69c2bbc6efbd539bcc66dfa1282275bd..6576d18dae99e6f7c4abd8d388e420c22468e129 100644
--- a/paddle/function/BufferArg.h
+++ b/paddle/function/BufferArg.h
@@ -57,58 +57,67 @@ typedef std::shared_ptr<BufferArg> BufferArgPtr;
  * output Buffer or added to the output Buffer is determined by the
  * argType_ property of the output BufferArg.
  */
+
+// ArgType is only used by output BufferArg.
+// For input argument, argType_ is ignored.
+// For output argument, need to set the argType_ of the BufferArg.
+enum ArgType {
+  UNSPECIFIED = 0,
+  ASSIGN_TO = 1,
+  ADD_TO = 2,
+};
 class BufferArg {
 public:
-  // ArgType is only used by output BufferArg.
-  // For input argument, argType_ is ignored.
-  // For output argument, need to set the argType_ of the BufferArg.
-  enum ArgType {
-    UNSPECIFIED = 0,
-    ASSIGN_TO = 1,
-    ADD_TO = 2,
-  };
-
   void setArgType(ArgType argType) { argType_ = argType; }
 
   ArgType getArgType() const { return argType_; }
 
 public:
-  BufferArg(void* buf, ValueType valueType, const TensorShape& shape)
-      : buf_(buf), valueType_(valueType), shape_(shape) {}
+  BufferArg(void* buf,
+            ValueType valueType,
+            const TensorShape& shape,
+            ArgType argType = UNSPECIFIED)
+      : buf_(buf), valueType_(valueType), shape_(shape), argType_(argType) {}
 
   BufferArg(void* buf, ValueType valueType)
       : buf_(buf), valueType_(valueType) {}
 
-  BufferArg(const Matrix& matrix)
+  BufferArg(const Matrix& matrix, ArgType argType = UNSPECIFIED)
       : buf_(
             const_cast<void*>(reinterpret_cast<const void*>(matrix.getData()))),
         valueType_(DataType<real>::value),
-        shape_(2) {
+        shape_(2),
+        argType_(argType) {
     shape_.setDim(0, matrix.getHeight());
     shape_.setDim(1, matrix.getWidth());
   }
 
-  BufferArg(const Matrix& matrix, const TensorShape& shape)
+  BufferArg(const Matrix& matrix,
+            const TensorShape& shape,
+            ArgType argType = UNSPECIFIED)
       : buf_(
             const_cast<void*>(reinterpret_cast<const void*>(matrix.getData()))),
         valueType_(DataType<real>::value),
-        shape_(shape) {
+        shape_(shape),
+        argType_(argType) {
     CHECK_EQ(matrix.getElementCnt(), shape.getElements());
   }
 
-  BufferArg(const Vector& vector)
+  BufferArg(const Vector& vector, ArgType argType = UNSPECIFIED)
      : buf_(
            const_cast<void*>(reinterpret_cast<const void*>(vector.getData()))),
        valueType_(DataType<real>::value),
-        shape_(1) {
+        shape_(1),
+        argType_(argType) {
     shape_.setDim(0, vector.getSize());
   }
 
-  BufferArg(const IVector& vector)
+  BufferArg(const IVector& vector, ArgType argType = UNSPECIFIED)
      : buf_(
            const_cast<void*>(reinterpret_cast<const void*>(vector.getData()))),
        valueType_(VALUE_TYPE_INT32),
-        shape_(1) {
+        shape_(1),
+        argType_(argType) {
     shape_.setDim(0, vector.getSize());
   }
 
@@ -163,8 +172,10 @@ protected:
 // if a < b then value_.buf_[a] < value_.buf_[b]
 class SequenceIdArg : public BufferArg {
 public:
-  SequenceIdArg(void* buf, const TensorShape& shape)
-      : BufferArg(buf, VALUE_TYPE_INT32, shape) {
+  SequenceIdArg(void* buf,
+                const TensorShape& shape,
+                ArgType argType = UNSPECIFIED)
+      : BufferArg(buf, VALUE_TYPE_INT32, shape, argType) {
     CHECK_EQ(shape_.ndims(), 1);
     numSeqs_ = shape_[0] - 1;
   }
@@ -187,11 +198,15 @@ public:
   SequenceArg(void* buf,
               ValueType valueType,
               const TensorShape& shape,
-              const SequenceIdArg& startPositions)
-      : BufferArg(buf, valueType, shape), startPositions_(startPositions) {}
+              const SequenceIdArg& startPositions,
+              ArgType argType = UNSPECIFIED)
+      : BufferArg(buf, valueType, shape, argType),
+        startPositions_(startPositions) {}
 
-  SequenceArg(const Matrix& matrix, const IVector& vector)
-      : BufferArg(matrix), startPositions_(vector) {}
+  SequenceArg(const Matrix& matrix,
+              const IVector& vector,
+              ArgType argType = UNSPECIFIED)
+      : BufferArg(matrix, argType), startPositions_(vector) {}
 
   ~SequenceArg() {}
 
@@ -214,8 +229,9 @@ public:
                   const BufferArg& col,
                   size_t nnz,
                   SparseDataFormat format,
-                  SparseDataType type)
-      : BufferArg(buf, valueType, shape),
+                  SparseDataType type,
+                  ArgType argType = UNSPECIFIED)
+      : BufferArg(buf, valueType, shape, argType),
         row_(row),
         col_(col),
         nnz_(nnz),
@@ -232,13 +248,13 @@ public:
     }
   }
 
-  SparseMatrixArg(const CpuSparseMatrix& sparse)
-      : BufferArg(sparse),
+  SparseMatrixArg(const CpuSparseMatrix& sparse, ArgType argType = UNSPECIFIED)
+      : BufferArg(sparse, argType),
         row_(reinterpret_cast<void*>(sparse.getRows()), VALUE_TYPE_INT32),
         col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32) {}
 
-  SparseMatrixArg(const GpuSparseMatrix& sparse)
-      : BufferArg(sparse),
+  SparseMatrixArg(const GpuSparseMatrix& sparse, ArgType argType = UNSPECIFIED)
+      : BufferArg(sparse, argType),
         row_(reinterpret_cast<void*>(sparse.getRows()), VALUE_TYPE_INT32),
         col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32) {}
 
diff --git a/paddle/function/ContextProjectionOp.cpp b/paddle/function/ContextProjectionOp.cpp
index 1a483c47953b12b2b2621bb290236f93cbce6f94..b50098c52123a84830e14486cbc82ea3e4a7ba94 100644
--- a/paddle/function/ContextProjectionOp.cpp
+++ b/paddle/function/ContextProjectionOp.cpp
@@ -84,12 +84,9 @@ public:
     begin_pad_ = config.get<size_t>("begin_pad");
   }
 
-  void calc(const BufferArgs& inputs,
-            const BufferArgs& outputs,
-            const BufferArgs& inouts) override {
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
     CHECK_EQ(3, inputs.size());
     CHECK_EQ(1, outputs.size());
-    CHECK_EQ(0, inouts.size());
 
     CHECK(outputs[0].data() && inputs[0].data() && inputs[2].data());
     CHECK_EQ(outputs[0].shape().ndims(), 2);
@@ -103,6 +100,7 @@ public:
     /// input and output has the same batch_size
     CHECK_EQ(inputs[0].shape()[0], outputs[0].shape()[0]);
 
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
     auto out_mat = outputs[0].matrix<Device>();
     auto in_mat = inputs[0].matrix<Device>();
     auto w_mat = !inputs[1].data()
@@ -194,12 +192,9 @@ public:
     total_pad_ = config.get<size_t>("total_pad");
   }
 
-  void calc(const BufferArgs& inputs,
-            const BufferArgs& outputs,
-            const BufferArgs& inouts) override {
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
     CHECK_EQ(3, inputs.size());
     CHECK_EQ(1, outputs.size());
-    CHECK_EQ(0, inouts.size());
 
     CHECK(outputs[0].data() && inputs[2].data());
     CHECK_EQ(outputs[0].shape().ndims(), 2);
@@ -214,6 +209,8 @@ public:
     /// dim of output = dim of input * context_length
     CHECK_EQ(outputs[0].shape()[1], inputs[0].shape()[1] * context_length_);
 
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+
     auto out_grad_mat = outputs[0].matrix<Device>();
     auto in_grad_mat = !inputs[0].data()
                            ? typename Tensor<real, Device>::Matrix(nullptr, 0, 0)
diff --git a/paddle/function/CrossMapNormalOp.cpp b/paddle/function/CrossMapNormalOp.cpp
index ec27db9c212965a7839d88691b935c29e3077a0b..23ee357a53d0d79f0ef17a08c65c939e9e369d33 100644
--- a/paddle/function/CrossMapNormalOp.cpp
+++ b/paddle/function/CrossMapNormalOp.cpp
@@ -112,6 +112,8 @@ void CrossMapNormalGrad(real* inputsGrad,
 }
 
 /**
+ * \brief {o_0, o_1} = calc(i_0)
+ *
  * \param inputs[0] input value.
  * \param outputs[0] output value.
  * \param outputs[1] denoms.
@@ -125,17 +127,16 @@ public:
     pow_ = config.get<real>("pow");
   }
 
-  void calc(const BufferArgs& inputs,
-            const BufferArgs& outputs,
-            const BufferArgs& inouts) override {
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
     CHECK_EQ(1, inputs.size());
     CHECK_EQ(2, outputs.size());
-    CHECK_EQ(0, inouts.size());
 
     CHECK_EQ(inputs[0].shape().ndims(), 4);
     CHECK(inputs[0].shape() == outputs[0].shape());
     CHECK(inputs[0].shape() == outputs[1].shape());
 
+    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
+    CHECK_EQ(outputs[1].getArgType(), ASSIGN_TO);
     size_t samples = inputs[0].shape()[0];
     size_t channels = inputs[0].shape()[1];
     size_t height = inputs[0].shape()[2];
@@ -160,6 +161,8 @@ private:
 };
 
 /**
+ * \brief {o_0} = calc(i_0, i_1, i_2, i_3)
+ *
  * \param inputs[0] input value.
  * \param inputs[1] output value.
  * \param inputs[2] output grad.
@@ -175,12 +178,9 @@ public:
     pow_ = config.get<real>("pow");
   }
 
-  void calc(const BufferArgs& inputs,
-            const BufferArgs& outputs,
-            const BufferArgs& inouts) override {
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
     CHECK_EQ(4, inputs.size());
     CHECK_EQ(1, outputs.size());
-    CHECK_EQ(0, inouts.size());
 
     CHECK_EQ(inputs[0].shape().ndims(), 4);
     CHECK(inputs[0].shape() == inputs[1].shape());
@@ -188,6 +188,9 @@ public:
     CHECK(inputs[0].shape() == inputs[3].shape());
     CHECK(inputs[0].shape() == outputs[0].shape());
 
+    // TODO(hedaoyuan): need support ASSIGN_TO mode.
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+
     size_t samples = inputs[0].shape()[0];
     size_t channels = inputs[0].shape()[1];
     size_t height = inputs[0].shape()[2];
diff --git a/paddle/function/Function.cpp b/paddle/function/Function.cpp
index 2f56cfc1b5492c23d596f4bfb5a7ae9f066bd10b..46af4e946258a5a956e957f38bfe06e43e7464dc 100644
--- a/paddle/function/Function.cpp
+++ b/paddle/function/Function.cpp
@@ -72,16 +72,18 @@ FuncConfig& FuncConfig::set<bool>(const std::string& key, bool v) {
   return *this;
 }
 
-void BufferArgs::addArg(const Matrix& arg, const TensorShape& shape) {
-  args_.push_back(std::make_shared<BufferArg>(arg, shape));
+void BufferArgs::addArg(const Matrix& arg,
+                        const TensorShape& shape,
+                        ArgType argType) {
+  args_.push_back(std::make_shared<BufferArg>(arg, shape, argType));
 }
 
-void BufferArgs::addArg(const CpuSparseMatrix& arg) {
-  args_.push_back(std::make_shared<SparseMatrixArg>(arg));
+void BufferArgs::addArg(const CpuSparseMatrix& arg, ArgType argType) {
+  args_.push_back(std::make_shared<SparseMatrixArg>(arg, argType));
 }
 
-void BufferArgs::addArg(const GpuSparseMatrix& arg) {
-  args_.push_back(std::make_shared<SparseMatrixArg>(arg));
+void BufferArgs::addArg(const GpuSparseMatrix& arg, ArgType argType) {
+  args_.push_back(std::make_shared<SparseMatrixArg>(arg, argType));
 }
 
 ClassRegistrar<FunctionBase> FunctionBase::funcRegistrar_;
diff --git a/paddle/function/Function.h b/paddle/function/Function.h
index 88d6824aa393933f29eb62975627b80133a8783c..249f8f9cfad58bf596e8cdce9188409b5690f969 100644
--- a/paddle/function/Function.h
+++ b/paddle/function/Function.h
@@ -49,7 +49,7 @@ protected:
 /**
  * Argument type for Function::calc().
  * A BufferArgs contains a set of BufferArg,
- * because Function can have multiple inputs, outputs and inouts.
+ * because Function can have multiple inputs and outputs.
  */
 class BufferArgs {
 public:
@@ -58,9 +58,11 @@ public:
 
   // add argument into BufferArgs
   // Tensor can be Matrix, Vector, IVector.
+  // For inputs, do not need argType.
+  // For outputs, the argType needs to be specified as ASSIGN_TO or ADD_TO.
   template <typename Tensor>
-  void addArg(const Tensor& arg) {
-    args_.push_back(std::make_shared<BufferArg>(arg));
+  void addArg(const Tensor& arg, ArgType argType = UNSPECIFIED) {
+    args_.push_back(std::make_shared<BufferArg>(arg, argType));
   }
 
   // Add arg into BufferArgs and reshape the arg.
@@ -68,10 +70,12 @@ public:
   // For example, arg represents an image buffer,
   // but Matrix can only represent a two-dimensional Tensor.
   // So need an extra argument to describe the shape of the image buffer.
-  void addArg(const Matrix& arg, const TensorShape& shape);
+  void addArg(const Matrix& arg,
+              const TensorShape& shape,
+              ArgType argType = UNSPECIFIED);
 
-  void addArg(const CpuSparseMatrix& arg);
-  void addArg(const GpuSparseMatrix& arg);
+  void addArg(const CpuSparseMatrix& arg, ArgType argType = UNSPECIFIED);
+  void addArg(const GpuSparseMatrix& arg, ArgType argType = UNSPECIFIED);
 
   // get argument
   const BufferArg& operator[](size_t num) const {
diff --git a/paddle/gserver/layers/ContextProjection.cpp b/paddle/gserver/layers/ContextProjection.cpp
index 26783a42cac42d79d5280641c2512e505adb5239..04d06cf33fed105d87ba0a828f053e6c9f826689 100644
--- a/paddle/gserver/layers/ContextProjection.cpp
+++ b/paddle/gserver/layers/ContextProjection.cpp
@@ -122,14 +122,13 @@ void ContextProjection::forward() {
 
   BufferArgs inputs;
   BufferArgs outputs;
-  BufferArgs inouts;
   inputs.addArg(*in_->value);
   inputs.addArg(CpuMatrix(w_ptr ? w_ptr->getData() : nullptr,
                           w_ptr ? w_ptr->getHeight() : 0,
                           input_dim));
   inputs.addArg(*in_->sequenceStartPositions->getVector(useGpu_));
-  outputs.addArg(*out_->value);
-  forward_[0]->calc(inputs, outputs, inouts);
+  outputs.addArg(*out_->value, ADD_TO);
+  forward_[0]->calc(inputs, outputs);
 
   if (state_ && config_.context_start() < 0) {
     CHECK_EQ(1, in_->getNumSequences());
@@ -166,15 +165,14 @@ void ContextProjection::backward(const UpdateCallback& callback) {
 
   BufferArgs inputs;
   BufferArgs outputs;
-  BufferArgs inouts;
   inputs.addArg(CpuMatrix(
       in_->grad ? in_->grad->getData() : nullptr, batch_size, input_dim));
   inputs.addArg(CpuMatrix(w_ptr ? w_ptr->getData() : nullptr,
                          w_ptr ? w_ptr->getHeight() : 0,
                          input_dim));
   inputs.addArg(*in_->sequenceStartPositions->getVector(useGpu_));
-  outputs.addArg(*out_->grad);
-  backward_[0]->calc(inputs, outputs, inouts);
+  outputs.addArg(*out_->grad, ADD_TO);
+  backward_[0]->calc(inputs, outputs);
 
   if (config_.trainable_padding()) {
     weight_->getParameterPtr()->incUpdate(callback);
diff --git a/paddle/gserver/layers/NormProjectionLayer.cpp b/paddle/gserver/layers/NormProjectionLayer.cpp
index 573de152fd0d5eadb59450e34edfd066b85db600..4331009de7e98d2326049e563e46a55a20366507 100644
--- a/paddle/gserver/layers/NormProjectionLayer.cpp
+++ b/paddle/gserver/layers/NormProjectionLayer.cpp
@@ -59,7 +59,6 @@ bool CMRProjectionNormLayer::init(const LayerMap& layerMap,
 
 void CMRProjectionNormLayer::forward(PassType passType) {
   Layer::forward(passType);
-
   /* malloc memory for the output_ if necessary */
   /* note: one sample correspond to one row */
   MatrixPtr input = inputLayers_[0]->getOutputValue();
@@ -67,42 +66,36 @@ void CMRProjectionNormLayer::forward(PassType passType) {
   int size = getSize();
   resetOutput(batchSize, size);
 
-  MatrixPtr outV = getOutputValue();
-
   Matrix::resizeOrCreate(denoms_, batchSize, size, /* trans */ false, useGpu_);
 
   shape_ = TensorShape({batchSize, channels_, imgSizeH_, imgSizeW_});
 
+  // prepare forward arguments
   BufferArgs inputs;
   BufferArgs outputs;
-  BufferArgs inouts;
-  inputs.addArg(*input, shape_);
-  outputs.addArg(*outV, shape_);
-  outputs.addArg(*denoms_, shape_);
+  inputs.addArg(*getInputValue(0), shape_);
+  outputs.addArg(*getOutputValue(), shape_, ASSIGN_TO);
+  outputs.addArg(*denoms_, shape_, ASSIGN_TO);
 
-  forward_[0]->calc(inputs, outputs, inouts);
+  forward_[0]->calc(inputs, outputs);
 }
 
 void CMRProjectionNormLayer::backward(const UpdateCallback& callback) {
   (void)callback;
 
-  if (NULL == inputLayers_[0]->getOutputGrad()) {
+  if (NULL == getInputGrad(0)) {
     return;
   }
-  /* Do derivation */
-  MatrixPtr preOutGrad = inputLayers_[0]->getOutputGrad();
-  MatrixPtr localGrad = getOutputGrad();
-  MatrixPtr localOutV = getOutputValue();
-  MatrixPtr preOutV = inputLayers_[0]->getOutputValue();
+  // prepare backward arguments
   BufferArgs inputs;
   BufferArgs outputs;
-  BufferArgs inouts;
-  inputs.addArg(*preOutV, shape_);
-  inputs.addArg(*localOutV, shape_);
-  inputs.addArg(*localGrad, shape_);
+  inputs.addArg(*getInputValue(0), shape_);
+  inputs.addArg(*getOutputValue(), shape_);
+  inputs.addArg(*getOutputGrad(), shape_);
   inputs.addArg(*denoms_, shape_);
-  outputs.addArg(*preOutGrad, shape_);
-  backward_[0]->calc(inputs, outputs, inouts);
+  outputs.addArg(*getInputGrad(0), shape_, ADD_TO);
+
+  backward_[0]->calc(inputs, outputs);
 }
 
 }  // namespace paddle
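
Note on usage (not part of the patch): the convention introduced above is that input BufferArgs carry no ArgType, while every output BufferArg states how the Function writes it. A minimal caller-side sketch, reusing identifiers that already appear in the layer code of this diff (shape_, denoms_, forward_[0], getInputValue, getOutputValue):

    // Inputs: argType is ignored, so only the buffer (and optionally a shape) is given.
    BufferArgs inputs;
    inputs.addArg(*getInputValue(0), shape_);

    // Outputs: the caller decides whether the Function overwrites (ASSIGN_TO)
    // or accumulates into (ADD_TO) the destination buffer.
    BufferArgs outputs;
    outputs.addArg(*getOutputValue(), shape_, ASSIGN_TO);
    outputs.addArg(*denoms_, shape_, ASSIGN_TO);

    // The former `inouts` argument is gone; calc takes inputs and outputs only.
    forward_[0]->calc(inputs, outputs);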
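
On the Function side, this patch only asserts the one mode each kernel currently supports (e.g. CHECK_EQ(outputs[0].getArgType(), ADD_TO)), and the TODO added in CrossMapNormalOp.cpp notes that ASSIGN_TO is not handled yet there. A hypothetical sketch of what dispatching on the mode inside a calc implementation could eventually look like (the branch bodies are placeholders, not existing code):

    void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
      CHECK_EQ(1, inputs.size());
      CHECK_EQ(1, outputs.size());
      if (outputs[0].getArgType() == ASSIGN_TO) {
        // overwrite outputs[0] with the result
      } else if (outputs[0].getArgType() == ADD_TO) {
        // accumulate the result into outputs[0]
      } else {
        LOG(FATAL) << "the argType of the output must be specified";
      }
    }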