diff --git a/demo/seqToseq/seqToseq_net.py b/demo/seqToseq/seqToseq_net.py
index 2b0c3f34648b0544063fe13f97ba071150b812ad..edd6ad3f739b6cefc24d235be55c7a8f541e1ab7 100644
--- a/demo/seqToseq/seqToseq_net.py
+++ b/demo/seqToseq/seqToseq_net.py
@@ -96,12 +96,12 @@ def gru_encoder_decoder(data_conf,
     encoded_vector = concat_layer(input=[src_forward, src_backward])
 
     with mixed_layer(size=decoder_size) as encoded_proj:
-        encoded_proj += full_matrix_projection(encoded_vector)
+        encoded_proj += full_matrix_projection(input=encoded_vector)
 
     backward_first = first_seq(input=src_backward)
     with mixed_layer(size=decoder_size,
                      act=TanhActivation(), ) as decoder_boot:
-        decoder_boot += full_matrix_projection(backward_first)
+        decoder_boot += full_matrix_projection(input=backward_first)
 
     def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
         decoder_mem = memory(name='gru_decoder',
@@ -113,8 +113,8 @@ def gru_encoder_decoder(data_conf,
                                    decoder_state=decoder_mem, )
 
         with mixed_layer(size=decoder_size * 3) as decoder_inputs:
-            decoder_inputs += full_matrix_projection(context)
-            decoder_inputs += full_matrix_projection(current_word)
+            decoder_inputs += full_matrix_projection(input=context)
+            decoder_inputs += full_matrix_projection(input=current_word)
 
         gru_step = gru_step_layer(name='gru_decoder',
                                   input=decoder_inputs,
diff --git a/paddle/cuda/include/hl_sequence.h b/paddle/cuda/include/hl_sequence.h
index 828c21beb2fbd4a3adc129f0b45e3d98933f87fd..46d86b2982f065802eec83ca7554f787d1d02f3a 100644
--- a/paddle/cuda/include/hl_sequence.h
+++ b/paddle/cuda/include/hl_sequence.h
@@ -143,7 +143,7 @@ extern void hl_context_projection_backward_weight(real* outputGrad,
  */
 extern void hl_sequence2batch_copy(real *batch,
                                    real *sequence,
-                                   int *batchIndex,
+                                   const int *batchIndex,
                                    int seqWidth,
                                    int batchCount,
                                    bool seq2batch);
diff --git a/paddle/cuda/include/stub/hl_sequence_stub.h b/paddle/cuda/include/stub/hl_sequence_stub.h
index 417f40e0a69f6c023945b9de218e09eb0b87f04a..aabd956c37f7dce48a379b995ab88a53aa65c760 100644
--- a/paddle/cuda/include/stub/hl_sequence_stub.h
+++ b/paddle/cuda/include/stub/hl_sequence_stub.h
@@ -62,7 +62,7 @@ inline void hl_context_projection_backward_weight(real* outputGrad,
 
 inline void hl_sequence2batch_copy(real *batch,
                                    real *sequence,
-                                   int *batchIndex,
+                                   const int *batchIndex,
                                    int seqWidth,
                                    int batchCount,
                                    bool seq2batch) {}
diff --git a/paddle/cuda/src/hl_cuda_sequence.cu b/paddle/cuda/src/hl_cuda_sequence.cu
index e028880156e5b191d032253bc62739c9e5ab34fc..63824eaa4c201c50ea20521801cd12de685aa3b9 100644
--- a/paddle/cuda/src/hl_cuda_sequence.cu
+++ b/paddle/cuda/src/hl_cuda_sequence.cu
@@ -374,7 +374,7 @@ template
 __global__
 void KeSequence2Batch(real *batch,
                       real *sequence,
-                      int *batchIndex,
+                      const int *batchIndex,
                       int seqWidth,
                       int batchCount) {
   int idx = threadIdx.x;
@@ -405,7 +405,7 @@ void KeSequence2Batch(real *batch,
 
 void hl_sequence2batch_copy(real *batch,
                             real *sequence,
-                            int *batchIndex,
+                            const int *batchIndex,
                             int seqWidth,
                             int batchCount,
                             bool seq2batch) {
diff --git a/paddle/gserver/layers/AgentLayer.cpp b/paddle/gserver/layers/AgentLayer.cpp
index 056e9568852ac93552413334be1960e9c17525d4..5e07446c71ff626684894cd99305ea8dc938d00d 100644
--- a/paddle/gserver/layers/AgentLayer.cpp
+++ b/paddle/gserver/layers/AgentLayer.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
 #include "AgentLayer.h"
 #include "paddle/utils/Logging.h"
@@ -62,8 +61,8 @@ void SequenceAgentLayer::forward(PassType passType) {
   // get Arguments from real layers
   if (numSamples_ > 0 && numSamples_ < realNumSequences) {
-    int numRows = realOutput.sequenceStartPositions->
-        getData(false)[numSamples_];
+    int numRows =
+        realOutput.sequenceStartPositions->getData(false)[numSamples_];
     CHECK(!realOutput.ids) << "Not supported";
     output_.subArgFrom(realOutput, /* offset */ 0, numRows, getSize(), useGpu_,
                        /* trans */ false, /* seqFlag */ true,
@@ -141,8 +140,8 @@ void ScatterAgentLayer::forward(PassType passType) {
   int width = this->getSize();
 
   if (realOutArg_.value || realOutArg_.ids) {
-    output_.subArgFrom(realOutArg_, /* offset */ idIndex_, idSize_,
-                       width, useGpu_);
+    output_.subArgFrom(realOutArg_, /* offset */ idIndex_, idSize_, width,
+                       useGpu_);
   } else {  // used in generation
     if (realLayer_->getOutput().ids) {
       IVector::resizeOrCreate(output_.ids, ids_->getSize(), useGpu_);
@@ -224,8 +223,8 @@ void SequenceScatterAgentLayer::forward(PassType passType) {
 
   if (realOutArg_.value || realOutArg_.ids) {
     CHECK(realOutArg_.sequenceStartPositions);
-    output_.subArgFrom(realOutArg_, /* offset */ idIndex_, idSize_,
-                       width, useGpu_, /* trans */ false, /* seqFlag */ true,
+    output_.subArgFrom(realOutArg_, /* offset */ idIndex_, idSize_, width,
+                       useGpu_, /* trans */ false, /* seqFlag */ true,
                        /* seqStart */ seqStartPosIndex_,
                        /* seqSize */ numSequences_);
   } else {
@@ -249,11 +248,12 @@ void SequenceScatterAgentLayer::forward(PassType passType) {
     CHECK_NE(input.sequenceStartPositions.get(),
              output_.sequenceStartPositions.get());
     ICpuGpuVector::resizeOrCreate(output_.sequenceStartPositions,
-                                   numSequences + 1, false);
+                                  numSequences + 1, false);
     int* outStarts = output_.sequenceStartPositions->getMutableData(false);
-    IVector::resizeOrCreate(cpuInputStartPos_, height, false);
-    int* inStarts = cpuInputStartPos_->getData();
+    ICpuGpuVector::resizeOrCreate(inputStartPos_, height, false);
+    int* inStarts = inputStartPos_->getMutableData(false);
+
     size_t offsetOut = 0;
     for (size_t i = 0; i < numSequences; ++i) {
       outStarts[i] = offsetOut;
@@ -266,13 +266,8 @@ void SequenceScatterAgentLayer::forward(PassType passType) {
     }
     outStarts[numSequences] = offsetOut;
 
-    if (useGpu_) {
-      IVector::resizeOrCreate(inputStartPos_, height, true);
-      inputStartPos_->copyFrom(*cpuInputStartPos_, HPPL_STREAM_DEFAULT);
-    } else {
-      inputStartPos_ = cpuInputStartPos_;
-    }
-    outputValue->copyByRowIndex(*input.value, *inputStartPos_);
+    outputValue->copyByRowIndex(*input.value,
+                                *inputStartPos_->getVector(useGpu_));
   }
 }
diff --git a/paddle/gserver/layers/AgentLayer.h b/paddle/gserver/layers/AgentLayer.h
index d82078dd93329459e7333850fec57dcf5fd915fc..3d7bf5583407078da4d66264e62581a59d5013ae 100644
--- a/paddle/gserver/layers/AgentLayer.h
+++ b/paddle/gserver/layers/AgentLayer.h
@@ -191,11 +191,7 @@ class SequenceScatterAgentLayer : public ScatterAgentLayer {
 protected:
   // use to store expanded cpuStartPositions or subSequenceStartPositions
   // of real layer.
-  IVectorPtr cpuInputStartPos_;
-
-  // point to cpuInputStartPos_ when useGpu_ is false
-  // copy from cpuInputStartPos_ when useGpu_ is true
-  IVectorPtr inputStartPos_;
+  ICpuGpuVectorPtr inputStartPos_;
 
 public:
   explicit SequenceScatterAgentLayer(const LayerConfig& config)
diff --git a/paddle/gserver/layers/ExpandLayer.cpp b/paddle/gserver/layers/ExpandLayer.cpp
index bbd0b53273b430101cc3a6c07d7882dffaa81e64..9290ce4f6d46c1237322549924ce1eb7754d2309 100644
--- a/paddle/gserver/layers/ExpandLayer.cpp
+++ b/paddle/gserver/layers/ExpandLayer.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
 #include "ExpandLayer.h"
 #include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
@@ -53,9 +52,8 @@ void ExpandLayer::forward(PassType passType) {
   const Argument& shapeInput = getInput(1);
   const Argument& dataInput = getInput(0);
   size_t outputBatchSize = shapeInput.getBatchSize();
-  auto startPositions =
-      type_ ? shapeInput.subSequenceStartPositions
-            : shapeInput.sequenceStartPositions;
+  auto startPositions = type_ ? shapeInput.subSequenceStartPositions
+                              : shapeInput.sequenceStartPositions;
   size_t numSequences = startPositions->getSize() - 1;
   const int* starts = startPositions->getData(false);
@@ -71,8 +69,7 @@ void ExpandLayer::forward(PassType passType) {
   // set output sequence info as shape sequence
   output_.sequenceStartPositions = shapeInput.sequenceStartPositions;
   if (shapeInput.hasSubseq()) {
-    output_.subSequenceStartPositions =
-        shapeInput.subSequenceStartPositions;
+    output_.subSequenceStartPositions = shapeInput.subSequenceStartPositions;
   }
 
   // reserve output: Expand output to batchsize of sequence data.
@@ -81,8 +78,8 @@ void ExpandLayer::forward(PassType passType) {
   MatrixPtr inputValue = getInputValue(0);
   MatrixPtr outputValue = getOutputValue();
 
-  IVector::resizeOrCreate(cpuExpandStartsPos_, outputBatchSize, false);
-  int* expandStarts = cpuExpandStartsPos_->getData();
+  ICpuGpuVector::resizeOrCreate(expandStartsPos_, outputBatchSize, false);
+  int* expandStarts = expandStartsPos_->getMutableData(false);
   for (size_t sequenceId = 0; sequenceId < numSequences; ++sequenceId) {
     int sequenceLength = starts[sequenceId + 1] - starts[sequenceId];
     for (int j = 0; j < sequenceLength; j++) {
@@ -90,15 +87,8 @@ void ExpandLayer::forward(PassType passType) {
     }
   }
 
-  if (useGpu_) {
-    // TODO(Dangqingqing) move copyFrom
-    IVector::resizeOrCreate(expandStartsPos_, outputBatchSize, true);
-    expandStartsPos_->copyFrom(*cpuExpandStartsPos_, HPPL_STREAM_DEFAULT);
-  } else {
-    expandStartsPos_ = cpuExpandStartsPos_;
-  }
-
-  outputValue->copyByRowIndex(*inputValue, *expandStartsPos_);
+  outputValue->copyByRowIndex(*inputValue,
+                              *expandStartsPos_->getVector(useGpu_));
 
   if (biases_.get() != NULL) {
     outputValue->addBias(*(biases_->getW()), 1);
@@ -108,16 +98,15 @@ void ExpandLayer::backward(const UpdateCallback& callback) {
   if (biases_ && biases_->getWGrad()) {
     biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
-      /* Increasing the number of gradient */
+    /* Increasing the number of gradient */
     biases_->getParameterPtr()->incUpdate(callback);
   }
 
   if (!getInputGrad(0)) return;
   MatrixPtr inputGrad = getInputGrad(0);
   MatrixPtr outputGrad = getOutputGrad();
-  auto cpuSeqStartPos =
-      type_ ? getInput(1).subSequenceStartPositions
-            : getInput(1).sequenceStartPositions;
+  auto cpuSeqStartPos = type_ ? getInput(1).subSequenceStartPositions
+                              : getInput(1).sequenceStartPositions;
   size_t numSequences = cpuSeqStartPos->getSize() - 1;
   const int* starts = cpuSeqStartPos->getData(false);
diff --git a/paddle/gserver/layers/ExpandLayer.h b/paddle/gserver/layers/ExpandLayer.h
index 8a3eb1c973a4751dc062734b99c116f15a23138b..fbe0ced9b1754d72874071575b33f552ccf93cc6 100644
--- a/paddle/gserver/layers/ExpandLayer.h
+++ b/paddle/gserver/layers/ExpandLayer.h
@@ -44,14 +44,9 @@ protected:
   enum ExpandLevel { kNonSeq = 0, kSeq = 1 };
   /// store the ExpandLevel
   int type_;
-  // TODO(luotao) use ICpuGpuVectorPtr to merge cpuExpandStartsPos_
-  // and expandStartsPos_
   /// expanded sequenceStartPositions or subSequenceStartPositions
   /// of input[1]
-  IVectorPtr cpuExpandStartsPos_;
-  /// point to cpuExpandStartsPos_ when useGpu_ is false,
-  /// copy from cpuExpandStartsPos_ when useGpu_ is true
-  IVectorPtr expandStartsPos_;
+  ICpuGpuVectorPtr expandStartsPos_;
 
 public:
   explicit ExpandLayer(const LayerConfig& config) : Layer(config) {}
diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp
index a6ff2f3b35d04783c718a7ce5c42cd586f3eadae..78519ce7aa8742192eb15e5c4705572a7df5dbdc 100644
--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -282,13 +282,13 @@ void GpuMatrix::copyFrom(const IVector& src) {
   copyFrom(matrix);
 }
 
-void GpuMatrix::copyByRowIndex(Matrix& b, IVector& rowIndex) {
+void GpuMatrix::copyByRowIndex(Matrix& b, const IVector& rowIndex) {
   size_t height = getHeight();
   size_t width = getWidth();
   CHECK_EQ(b.getWidth(), width);
   real* dst = getData();
   real* src = b.getData();
-  int* index = rowIndex.getData();
+  const int* index = rowIndex.getData();
   hl_sequence2batch_copy(dst, src, index, width, height, true);
 }
@@ -1278,11 +1278,11 @@ void CpuMatrix::copyFrom(const IVector& src) {
   }
 }
 
-void CpuMatrix::copyByRowIndex(Matrix& b, IVector& rowIndex) {
+void CpuMatrix::copyByRowIndex(Matrix& b, const IVector& rowIndex) {
   size_t height = getHeight();
   size_t width = getWidth();
   CHECK_EQ(b.getWidth(), width);
-  int* index = rowIndex.getData();
+  const int* index = rowIndex.getData();
   for (size_t i = 0; i < height; i++) {
     CHECK_LT(static_cast<size_t>(index[i]), b.getHeight());
     real* src = b.getData() + index[i] * width;
diff --git a/paddle/math/Matrix.h b/paddle/math/Matrix.h
index 5c15c94012816eee6234298142c7c8baa53afae7..25104fe1c6d70afbf39ab47a17ce0bf21a121427 100644
--- a/paddle/math/Matrix.h
+++ b/paddle/math/Matrix.h
@@ -253,7 +253,7 @@ public:
     LOG(FATAL) << "copy data from int vector only available on CpuMatrix.";
   }
 
-  virtual void copyByRowIndex(Matrix& b, IVector& rowIndex) {
+  virtual void copyByRowIndex(Matrix& b, const IVector& rowIndex) {
     LOG(FATAL) << "Not implemented";
   }
 
@@ -979,7 +979,7 @@ public:
 
   void copyFrom(const IVector& src);
 
-  void copyByRowIndex(Matrix& b, IVector& rowIndex);
+  void copyByRowIndex(Matrix& b, const IVector& rowIndex);
 
   MatrixPtr clone(size_t height, size_t width, bool useGpu = false);
 
@@ -1241,7 +1241,7 @@ public:
 
   void copyFrom(CpuSparseMatrix& src);
 
-  void copyByRowIndex(Matrix& b, IVector& rowIndex);
+  void copyByRowIndex(Matrix& b, const IVector& rowIndex);
 
   MatrixPtr clone(size_t height, size_t width, bool useGpu = false);
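
For reference, the single-vector pattern adopted above in ExpandLayer::forward and SequenceScatterAgentLayer::forward can be summarized in isolation. The sketch below is illustrative only and not part of the patch: the helper name fillAndCopyByRowIndex and its argument list are hypothetical, and it assumes Paddle's internal Matrix/Vector headers. It shows the index buffer being written once through getMutableData(false), with getVector(useGpu) handing copyByRowIndex the vector on the requested device, so the explicit useGpu_ branch with an extra copyFrom at the call site goes away.

// Illustrative sketch only -- not part of the patch. Assumes Paddle's
// internal headers; the helper function itself is hypothetical.
#include "paddle/math/Matrix.h"
#include "paddle/math/Vector.h"

namespace paddle {

void fillAndCopyByRowIndex(Matrix& out, Matrix& in,
                           ICpuGpuVectorPtr& rowIndex,  // shared CPU/GPU index
                           size_t numRows, bool useGpu) {
  // One ICpuGpuVectorPtr replaces the old cpuXxxStartsPos_/xxxStartsPos_ pair.
  ICpuGpuVector::resizeOrCreate(rowIndex, numRows, /* useGpu */ false);

  // Fill the CPU-side buffer; the GPU copy is synchronized on demand.
  int* index = rowIndex->getMutableData(false);
  for (size_t i = 0; i < numRows; ++i) {
    index[i] = static_cast<int>(i);  // identity mapping, for illustration only
  }

  // getVector(useGpu) returns the IVector on the requested device, so no
  // explicit if (useGpu_) { resizeOrCreate + copyFrom } branch is needed here.
  out.copyByRowIndex(in, *rowIndex->getVector(useGpu));
}

}  // namespace paddle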