Remove some copyFrom calls in AgentLayer and ExpandLayer; fix warning in seq2seq config (#183)

91df6062 · luotao1 · emailweixu · cebdb667 · 91df6062 · 91df6062
10 changed files
--- a/demo/seqToseq/seqToseq_net.py
+++ b/demo/seqToseq/seqToseq_net.py
@@ -96,12 +96,12 @@ def gru_encoder_decoder(data_conf,
    encoded_vector = concat_layer(input=[src_forward, src_backward])
    with mixed_layer(size=decoder_size) as encoded_proj:
-        encoded_proj += full_matrix_projection(encoded_vector)
+        encoded_proj += full_matrix_projection(input=encoded_vector)
    backward_first = first_seq(input=src_backward)
    with mixed_layer(size=decoder_size,
                     act=TanhActivation(), ) as decoder_boot:
-        decoder_boot += full_matrix_projection(backward_first)
+        decoder_boot += full_matrix_projection(input=backward_first)
    def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
        decoder_mem = memory(name='gru_decoder',
@@ -113,8 +113,8 @@ def gru_encoder_decoder(data_conf,
                                   decoder_state=decoder_mem, )
        with mixed_layer(size=decoder_size * 3) as decoder_inputs:
-            decoder_inputs += full_matrix_projection(context)
+            decoder_inputs += full_matrix_projection(input=context)
-            decoder_inputs += full_matrix_projection(current_word)
+            decoder_inputs += full_matrix_projection(input=current_word)
        gru_step = gru_step_layer(name='gru_decoder',
                                  input=decoder_inputs,

--- a/paddle/cuda/include/hl_sequence.h
+++ b/paddle/cuda/include/hl_sequence.h
@@ -143,7 +143,7 @@ extern void hl_context_projection_backward_weight(real* outputGrad,
 */
 extern void hl_sequence2batch_copy(real *batch,
                                   real *sequence,
-                                   int *batchIndex,
+                                   const int *batchIndex,
                                   int seqWidth,
                                   int batchCount,
                                   bool seq2batch);

--- a/paddle/cuda/include/stub/hl_sequence_stub.h
+++ b/paddle/cuda/include/stub/hl_sequence_stub.h
@@ -62,7 +62,7 @@ inline void hl_context_projection_backward_weight(real* outputGrad,
 inline void hl_sequence2batch_copy(real *batch,
                                   real *sequence,
-                                   int *batchIndex,
+                                   const int *batchIndex,
                                   int seqWidth,
                                   int batchCount,
                                   bool seq2batch) {}

--- a/paddle/cuda/src/hl_cuda_sequence.cu
+++ b/paddle/cuda/src/hl_cuda_sequence.cu
@@ -374,7 +374,7 @@ template<int blockDimX, int blockDimY, int gridDimX, bool seq2batch, bool isAdd>
 __global__
 void KeSequence2Batch(real *batch,
                      real *sequence,
-                      int *batchIndex,
+                      const int *batchIndex,
                      int seqWidth,
                      int batchCount) {
  int idx = threadIdx.x;
@@ -405,7 +405,7 @@ void KeSequence2Batch(real *batch,
 void hl_sequence2batch_copy(real *batch,
                            real *sequence,
-                            int *batchIndex,
+                            const int *batchIndex,
                            int seqWidth,
                            int batchCount,
                            bool seq2batch) {

--- a/paddle/gserver/layers/AgentLayer.cpp
+++ b/paddle/gserver/layers/AgentLayer.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "AgentLayer.h"
 #include "paddle/utils/Logging.h"
@@ -62,8 +61,8 @@ void SequenceAgentLayer::forward(PassType passType) {
  // get Arguments from real layers
  if (numSamples_ > 0 && numSamples_ < realNumSequences) {
-    int numRows = realOutput.sequenceStartPositions->
+    int numRows =
-                  getData(false)[numSamples_];
+        realOutput.sequenceStartPositions->getData(false)[numSamples_];
    CHECK(!realOutput.ids) << "Not supported";
    output_.subArgFrom(realOutput, /* offset */ 0, numRows, getSize(), useGpu_,
                       /* trans */ false, /* seqFlag */ true,
@@ -141,8 +140,8 @@ void ScatterAgentLayer::forward(PassType passType) {
  int width = this->getSize();
  if (realOutArg_.value || realOutArg_.ids) {
-    output_.subArgFrom(realOutArg_, /* offset */ idIndex_, idSize_,
+    output_.subArgFrom(realOutArg_, /* offset */ idIndex_, idSize_, width,
-                       width, useGpu_);
+                       useGpu_);
  } else {  // used in generation
    if (realLayer_->getOutput().ids) {
      IVector::resizeOrCreate(output_.ids, ids_->getSize(), useGpu_);
@@ -224,8 +223,8 @@ void SequenceScatterAgentLayer::forward(PassType passType) {
  if (realOutArg_.value || realOutArg_.ids) {
    CHECK(realOutArg_.sequenceStartPositions);
-    output_.subArgFrom(realOutArg_, /* offset */ idIndex_, idSize_,
+    output_.subArgFrom(realOutArg_, /* offset */ idIndex_, idSize_, width,
-                       width, useGpu_, /* trans */ false, /* seqFlag */ true,
+                       useGpu_, /* trans */ false, /* seqFlag */ true,
                       /* seqStart */ seqStartPosIndex_,
                       /* seqSize */ numSequences_);
  } else {
@@ -249,11 +248,12 @@ void SequenceScatterAgentLayer::forward(PassType passType) {
    CHECK_NE(input.sequenceStartPositions.get(),
             output_.sequenceStartPositions.get());
    ICpuGpuVector::resizeOrCreate(output_.sequenceStartPositions,
-                                   numSequences + 1, false);
+                                  numSequences + 1, false);
    int* outStarts = output_.sequenceStartPositions->getMutableData(false);
-    IVector::resizeOrCreate(cpuInputStartPos_, height, false);
+    ICpuGpuVector::resizeOrCreate(inputStartPos_, height, false);
-    int* inStarts = cpuInputStartPos_->getData();
+    int* inStarts = inputStartPos_->getMutableData(false);
    size_t offsetOut = 0;
    for (size_t i = 0; i < numSequences; ++i) {
      outStarts[i] = offsetOut;
@@ -266,13 +266,8 @@ void SequenceScatterAgentLayer::forward(PassType passType) {
    }
    outStarts[numSequences] = offsetOut;
-    if (useGpu_) {
+    outputValue->copyByRowIndex(*input.value,
-      IVector::resizeOrCreate(inputStartPos_, height, true);
+                                *inputStartPos_->getVector(useGpu_));
-      inputStartPos_->copyFrom(*cpuInputStartPos_, HPPL_STREAM_DEFAULT);
-    } else {
-      inputStartPos_ = cpuInputStartPos_;
-    }
-    outputValue->copyByRowIndex(*input.value, *inputStartPos_);
  }
 }

--- a/paddle/gserver/layers/AgentLayer.h
+++ b/paddle/gserver/layers/AgentLayer.h
@@ -191,11 +191,7 @@ class SequenceScatterAgentLayer : public ScatterAgentLayer {
 protected:
  // use to store expanded cpuStartPositions or subSequenceStartPositions
  // of real layer.
-  IVectorPtr cpuInputStartPos_;
+  ICpuGpuVectorPtr inputStartPos_;
-  // point to cpuInputStartPos_ when useGpu_ is false
-  // copy from cpuInputStartPos_ when useGpu_ is true
-  IVectorPtr inputStartPos_;
 public:
  explicit SequenceScatterAgentLayer(const LayerConfig& config)

--- a/paddle/gserver/layers/ExpandLayer.cpp
+++ b/paddle/gserver/layers/ExpandLayer.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "ExpandLayer.h"
 #include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
@@ -53,9 +52,8 @@ void ExpandLayer::forward(PassType passType) {
  const Argument& shapeInput = getInput(1);
  const Argument& dataInput = getInput(0);
  size_t outputBatchSize = shapeInput.getBatchSize();
-  auto startPositions =
+  auto startPositions = type_ ? shapeInput.subSequenceStartPositions
-      type_ ? shapeInput.subSequenceStartPositions
+                              : shapeInput.sequenceStartPositions;
-            : shapeInput.sequenceStartPositions;
  size_t numSequences = startPositions->getSize() - 1;
  const int* starts = startPositions->getData(false);
@@ -71,8 +69,7 @@ void ExpandLayer::forward(PassType passType) {
  // set output sequence info as shape sequence
  output_.sequenceStartPositions = shapeInput.sequenceStartPositions;
  if (shapeInput.hasSubseq()) {
-    output_.subSequenceStartPositions =
+    output_.subSequenceStartPositions = shapeInput.subSequenceStartPositions;
-        shapeInput.subSequenceStartPositions;
  }
  // reserve output: Expand output to batchsize of sequence data.
@@ -81,8 +78,8 @@ void ExpandLayer::forward(PassType passType) {
  MatrixPtr inputValue = getInputValue(0);
  MatrixPtr outputValue = getOutputValue();
-  IVector::resizeOrCreate(cpuExpandStartsPos_, outputBatchSize, false);
+  ICpuGpuVector::resizeOrCreate(expandStartsPos_, outputBatchSize, false);
-  int* expandStarts = cpuExpandStartsPos_->getData();
+  int* expandStarts = expandStartsPos_->getMutableData(false);
  for (size_t sequenceId = 0; sequenceId < numSequences; ++sequenceId) {
    int sequenceLength = starts[sequenceId + 1] - starts[sequenceId];
    for (int j = 0; j < sequenceLength; j++) {
@@ -90,15 +87,8 @@ void ExpandLayer::forward(PassType passType) {
    }
  }
-  if (useGpu_) {
+  outputValue->copyByRowIndex(*inputValue,
-    // TODO(Dangqingqing) move copyFrom
+                              *expandStartsPos_->getVector(useGpu_));
-    IVector::resizeOrCreate(expandStartsPos_, outputBatchSize, true);
-    expandStartsPos_->copyFrom(*cpuExpandStartsPos_, HPPL_STREAM_DEFAULT);
-  } else {
-    expandStartsPos_ = cpuExpandStartsPos_;
-  }
-  outputValue->copyByRowIndex(*inputValue, *expandStartsPos_);
  if (biases_.get() != NULL) {
    outputValue->addBias(*(biases_->getW()), 1);
@@ -108,16 +98,15 @@ void ExpandLayer::forward(PassType passType) {
 void ExpandLayer::backward(const UpdateCallback& callback) {
  if (biases_ && biases_->getWGrad()) {
    biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
-     /* Increasing the number of gradient */
+    /* Increasing the number of gradient */
    biases_->getParameterPtr()->incUpdate(callback);
  }
  if (!getInputGrad(0)) return;
  MatrixPtr inputGrad = getInputGrad(0);
  MatrixPtr outputGrad = getOutputGrad();
-  auto cpuSeqStartPos =
+  auto cpuSeqStartPos = type_ ? getInput(1).subSequenceStartPositions
-      type_ ? getInput(1).subSequenceStartPositions
+                              : getInput(1).sequenceStartPositions;
-            : getInput(1).sequenceStartPositions;
  size_t numSequences = cpuSeqStartPos->getSize() - 1;
  const int* starts = cpuSeqStartPos->getData(false);

--- a/paddle/gserver/layers/ExpandLayer.h
+++ b/paddle/gserver/layers/ExpandLayer.h
@@ -44,14 +44,9 @@ protected:
  enum ExpandLevel { kNonSeq = 0, kSeq = 1 };
  /// store the ExpandLevel
  int type_;
-  // TODO(luotao) use ICpuGpuVectorPtr to merge cpuExpandStartsPos_
-  // and expandStartsPos_
  /// expanded sequenceStartPositions or subSequenceStartPositions
  /// of input[1]
-  IVectorPtr cpuExpandStartsPos_;
+  ICpuGpuVectorPtr expandStartsPos_;
-  /// point to cpuExpandStartsPos_ when useGpu_ is false,
-  /// copy from cpuExpandStartsPos_ when useGpu_ is true
-  IVectorPtr expandStartsPos_;
 public:
  explicit ExpandLayer(const LayerConfig& config) : Layer(config) {}

--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -282,13 +282,13 @@ void GpuMatrix::copyFrom(const IVector& src) {
  copyFrom(matrix);
 }
-void GpuMatrix::copyByRowIndex(Matrix& b, IVector& rowIndex) {
+void GpuMatrix::copyByRowIndex(Matrix& b, const IVector& rowIndex) {
  size_t height = getHeight();
  size_t width = getWidth();
  CHECK_EQ(b.getWidth(), width);
  real* dst = getData();
  real* src = b.getData();
-  int* index = rowIndex.getData();
+  const int* index = rowIndex.getData();
  hl_sequence2batch_copy(dst, src, index, width, height, true);
 }
@@ -1278,11 +1278,11 @@ void CpuMatrix::copyFrom(const IVector& src) {
  }
 }
-void CpuMatrix::copyByRowIndex(Matrix& b, IVector& rowIndex) {
+void CpuMatrix::copyByRowIndex(Matrix& b, const IVector& rowIndex) {
  size_t height = getHeight();
  size_t width = getWidth();
  CHECK_EQ(b.getWidth(), width);
-  int* index = rowIndex.getData();
+  const int* index = rowIndex.getData();
  for (size_t i = 0; i < height; i++) {
    CHECK_LT(static_cast<size_t>(index[i]), b.getHeight());
    real* src = b.getData() + index[i] * width;

--- a/paddle/math/Matrix.h
+++ b/paddle/math/Matrix.h
@@ -253,7 +253,7 @@ public:
    LOG(FATAL) << "copy data from int vector only available on CpuMatrix.";
  }
-  virtual void copyByRowIndex(Matrix& b, IVector& rowIndex) {
+  virtual void copyByRowIndex(Matrix& b, const IVector& rowIndex) {
    LOG(FATAL) << "Not implemented";
  }
@@ -979,7 +979,7 @@ public:
  void copyFrom(const IVector& src);
-  void copyByRowIndex(Matrix& b, IVector& rowIndex);
+  void copyByRowIndex(Matrix& b, const IVector& rowIndex);
  MatrixPtr clone(size_t height, size_t width, bool useGpu = false);
@@ -1241,7 +1241,7 @@ public:
  void copyFrom(CpuSparseMatrix& src);
-  void copyByRowIndex(Matrix& b, IVector& rowIndex);
+  void copyByRowIndex(Matrix& b, const IVector& rowIndex);
  MatrixPtr clone(size_t height, size_t width, bool useGpu = false);