diff --git a/demo/seqToseq/seqToseq_net.py b/demo/seqToseq/seqToseq_net.py
index 2b0c3f34648b0544063fe13f97ba071150b812ad..edd6ad3f739b6cefc24d235be55c7a8f541e1ab7 100644
--- a/demo/seqToseq/seqToseq_net.py
+++ b/demo/seqToseq/seqToseq_net.py
@@ -96,12 +96,12 @@ def gru_encoder_decoder(data_conf,
     encoded_vector = concat_layer(input=[src_forward, src_backward])
 
     with mixed_layer(size=decoder_size) as encoded_proj:
-        encoded_proj += full_matrix_projection(encoded_vector)
+        encoded_proj += full_matrix_projection(input=encoded_vector)
 
     backward_first = first_seq(input=src_backward)
     with mixed_layer(size=decoder_size,
                      act=TanhActivation(), ) as decoder_boot:
-        decoder_boot += full_matrix_projection(backward_first)
+        decoder_boot += full_matrix_projection(input=backward_first)
 
     def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
         decoder_mem = memory(name='gru_decoder',
@@ -113,8 +113,8 @@ def gru_encoder_decoder(data_conf,
                                    decoder_state=decoder_mem, )
 
         with mixed_layer(size=decoder_size * 3) as decoder_inputs:
-            decoder_inputs += full_matrix_projection(context)
-            decoder_inputs += full_matrix_projection(current_word)
+            decoder_inputs += full_matrix_projection(input=context)
+            decoder_inputs += full_matrix_projection(input=current_word)
 
         gru_step = gru_step_layer(name='gru_decoder',
                                   input=decoder_inputs,
diff --git a/paddle/cuda/include/hl_sequence.h b/paddle/cuda/include/hl_sequence.h
index 828c21beb2fbd4a3adc129f0b45e3d98933f87fd..46d86b2982f065802eec83ca7554f787d1d02f3a 100644
--- a/paddle/cuda/include/hl_sequence.h
+++ b/paddle/cuda/include/hl_sequence.h
@@ -143,7 +143,7 @@ extern void hl_context_projection_backward_weight(real* outputGrad,
  */
 extern void hl_sequence2batch_copy(real *batch,
                                    real *sequence,
-                                   int *batchIndex,
+                                   const int *batchIndex,
                                    int seqWidth,
                                    int batchCount,
                                    bool seq2batch);
diff --git a/paddle/cuda/include/stub/hl_sequence_stub.h b/paddle/cuda/include/stub/hl_sequence_stub.h
index 417f40e0a69f6c023945b9de218e09eb0b87f04a..aabd956c37f7dce48a379b995ab88a53aa65c760 100644
--- a/paddle/cuda/include/stub/hl_sequence_stub.h
+++ b/paddle/cuda/include/stub/hl_sequence_stub.h
@@ -62,7 +62,7 @@ inline void hl_context_projection_backward_weight(real* outputGrad,
 
 inline void hl_sequence2batch_copy(real *batch,
                                    real *sequence,
-                                   int *batchIndex,
+                                   const int *batchIndex,
                                    int seqWidth,
                                    int batchCount,
                                    bool seq2batch) {}
diff --git a/paddle/cuda/src/hl_cuda_sequence.cu b/paddle/cuda/src/hl_cuda_sequence.cu
index e028880156e5b191d032253bc62739c9e5ab34fc..63824eaa4c201c50ea20521801cd12de685aa3b9 100644
--- a/paddle/cuda/src/hl_cuda_sequence.cu
+++ b/paddle/cuda/src/hl_cuda_sequence.cu
@@ -374,7 +374,7 @@ template
 __global__
 void KeSequence2Batch(real *batch,
                       real *sequence,
-                      int *batchIndex,
+                      const int *batchIndex,
                       int seqWidth,
                       int batchCount) {
   int idx = threadIdx.x;
@@ -405,7 +405,7 @@ void KeSequence2Batch(real *batch,
 
 void hl_sequence2batch_copy(real *batch,
                             real *sequence,
-                            int *batchIndex,
+                            const int *batchIndex,
                             int seqWidth,
                             int batchCount,
                             bool seq2batch) {
diff --git a/paddle/gserver/layers/AgentLayer.cpp b/paddle/gserver/layers/AgentLayer.cpp
index 056e9568852ac93552413334be1960e9c17525d4..5e07446c71ff626684894cd99305ea8dc938d00d 100644
--- a/paddle/gserver/layers/AgentLayer.cpp
+++ b/paddle/gserver/layers/AgentLayer.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
 #include "AgentLayer.h"
 #include "paddle/utils/Logging.h"
@@ -62,8 +61,8 @@ void SequenceAgentLayer::forward(PassType passType) {
   // get Arguments from real layers
   if (numSamples_ > 0 && numSamples_ < realNumSequences) {
-    int numRows = realOutput.sequenceStartPositions->
-        getData(false)[numSamples_];
+    int numRows =
+        realOutput.sequenceStartPositions->getData(false)[numSamples_];
     CHECK(!realOutput.ids) << "Not supported";
     output_.subArgFrom(realOutput, /* offset */ 0, numRows, getSize(), useGpu_,
                        /* trans */ false, /* seqFlag */ true,
@@ -141,8 +140,8 @@ void ScatterAgentLayer::forward(PassType passType) {
   int width = this->getSize();
 
   if (realOutArg_.value || realOutArg_.ids) {
-    output_.subArgFrom(realOutArg_, /* offset */ idIndex_, idSize_,
-                       width, useGpu_);
+    output_.subArgFrom(realOutArg_, /* offset */ idIndex_, idSize_, width,
+                       useGpu_);
   } else {  // used in generation
     if (realLayer_->getOutput().ids) {
       IVector::resizeOrCreate(output_.ids, ids_->getSize(), useGpu_);
@@ -224,8 +223,8 @@ void SequenceScatterAgentLayer::forward(PassType passType) {
 
   if (realOutArg_.value || realOutArg_.ids) {
     CHECK(realOutArg_.sequenceStartPositions);
-    output_.subArgFrom(realOutArg_, /* offset */ idIndex_, idSize_,
-                       width, useGpu_, /* trans */ false, /* seqFlag */ true,
+    output_.subArgFrom(realOutArg_, /* offset */ idIndex_, idSize_, width,
+                       useGpu_, /* trans */ false, /* seqFlag */ true,
                        /* seqStart */ seqStartPosIndex_,
                        /* seqSize */ numSequences_);
   } else {
@@ -249,11 +248,12 @@ void SequenceScatterAgentLayer::forward(PassType passType) {
     CHECK_NE(input.sequenceStartPositions.get(),
              output_.sequenceStartPositions.get());
     ICpuGpuVector::resizeOrCreate(output_.sequenceStartPositions,
-                                   numSequences + 1, false);
+                                  numSequences + 1, false);
     int* outStarts = output_.sequenceStartPositions->getMutableData(false);
-    IVector::resizeOrCreate(cpuInputStartPos_, height, false);
-    int* inStarts = cpuInputStartPos_->getData();
+    ICpuGpuVector::resizeOrCreate(inputStartPos_, height, false);
+    int* inStarts = inputStartPos_->getMutableData(false);
+
     size_t offsetOut = 0;
     for (size_t i = 0; i < numSequences; ++i) {
       outStarts[i] = offsetOut;
@@ -266,13 +266,8 @@ void SequenceScatterAgentLayer::forward(PassType passType) {
     }
     outStarts[numSequences] = offsetOut;
 
-    if (useGpu_) {
-      IVector::resizeOrCreate(inputStartPos_, height, true);
-      inputStartPos_->copyFrom(*cpuInputStartPos_, HPPL_STREAM_DEFAULT);
-    } else {
-      inputStartPos_ = cpuInputStartPos_;
-    }
-    outputValue->copyByRowIndex(*input.value, *inputStartPos_);
+    outputValue->copyByRowIndex(*input.value,
+                                *inputStartPos_->getVector(useGpu_));
   }
 }
diff --git a/paddle/gserver/layers/AgentLayer.h b/paddle/gserver/layers/AgentLayer.h
index d82078dd93329459e7333850fec57dcf5fd915fc..3d7bf5583407078da4d66264e62581a59d5013ae 100644
--- a/paddle/gserver/layers/AgentLayer.h
+++ b/paddle/gserver/layers/AgentLayer.h
@@ -191,11 +191,7 @@ class SequenceScatterAgentLayer : public ScatterAgentLayer {
 protected:
   // use to store expanded cpuStartPositions or subSequenceStartPositions
   // of real layer.
-  IVectorPtr cpuInputStartPos_;
-
-  // point to cpuInputStartPos_ when useGpu_ is false
-  // copy from cpuInputStartPos_ when useGpu_ is true
-  IVectorPtr inputStartPos_;
+  ICpuGpuVectorPtr inputStartPos_;
 
 public:
   explicit SequenceScatterAgentLayer(const LayerConfig& config)
diff --git a/paddle/gserver/layers/ExpandLayer.cpp b/paddle/gserver/layers/ExpandLayer.cpp
index bbd0b53273b430101cc3a6c07d7882dffaa81e64..9290ce4f6d46c1237322549924ce1eb7754d2309 100644
--- a/paddle/gserver/layers/ExpandLayer.cpp
+++ b/paddle/gserver/layers/ExpandLayer.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
 #include "ExpandLayer.h"
 #include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
@@ -53,9 +52,8 @@ void ExpandLayer::forward(PassType passType) {
   const Argument& shapeInput = getInput(1);
   const Argument& dataInput = getInput(0);
   size_t outputBatchSize = shapeInput.getBatchSize();
-  auto startPositions =
-      type_ ? shapeInput.subSequenceStartPositions
-            : shapeInput.sequenceStartPositions;
+  auto startPositions = type_ ? shapeInput.subSequenceStartPositions
+                              : shapeInput.sequenceStartPositions;
   size_t numSequences = startPositions->getSize() - 1;
   const int* starts = startPositions->getData(false);
@@ -71,8 +69,7 @@ void ExpandLayer::forward(PassType passType) {
   // set output sequence info as shape sequence
   output_.sequenceStartPositions = shapeInput.sequenceStartPositions;
   if (shapeInput.hasSubseq()) {
-    output_.subSequenceStartPositions =
-        shapeInput.subSequenceStartPositions;
+    output_.subSequenceStartPositions = shapeInput.subSequenceStartPositions;
   }
 
   // reserve output: Expand output to batchsize of sequence data.
@@ -81,8 +78,8 @@ void ExpandLayer::forward(PassType passType) {
   MatrixPtr inputValue = getInputValue(0);
   MatrixPtr outputValue = getOutputValue();
 
-  IVector::resizeOrCreate(cpuExpandStartsPos_, outputBatchSize, false);
-  int* expandStarts = cpuExpandStartsPos_->getData();
+  ICpuGpuVector::resizeOrCreate(expandStartsPos_, outputBatchSize, false);
+  int* expandStarts = expandStartsPos_->getMutableData(false);
   for (size_t sequenceId = 0; sequenceId < numSequences; ++sequenceId) {
     int sequenceLength = starts[sequenceId + 1] - starts[sequenceId];
     for (int j = 0; j < sequenceLength; j++) {
@@ -90,15 +87,8 @@ void ExpandLayer::forward(PassType passType) {
     }
   }
 
-  if (useGpu_) {
-    // TODO(Dangqingqing) move copyFrom
-    IVector::resizeOrCreate(expandStartsPos_, outputBatchSize, true);
-    expandStartsPos_->copyFrom(*cpuExpandStartsPos_, HPPL_STREAM_DEFAULT);
-  } else {
-    expandStartsPos_ = cpuExpandStartsPos_;
-  }
-
-  outputValue->copyByRowIndex(*inputValue, *expandStartsPos_);
+  outputValue->copyByRowIndex(*inputValue,
+                              *expandStartsPos_->getVector(useGpu_));
 
   if (biases_.get() != NULL) {
     outputValue->addBias(*(biases_->getW()), 1);
@@ -108,16 +98,15 @@ void ExpandLayer::backward(const UpdateCallback& callback) {
   if (biases_ && biases_->getWGrad()) {
     biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
-      /* Increasing the number of gradient */
+    /* Increasing the number of gradient */
     biases_->getParameterPtr()->incUpdate(callback);
   }
 
   if (!getInputGrad(0)) return;
   MatrixPtr inputGrad = getInputGrad(0);
   MatrixPtr outputGrad = getOutputGrad();
-  auto cpuSeqStartPos =
-      type_ ? getInput(1).subSequenceStartPositions
-            : getInput(1).sequenceStartPositions;
+  auto cpuSeqStartPos = type_ ? getInput(1).subSequenceStartPositions
+                              : getInput(1).sequenceStartPositions;
   size_t numSequences = cpuSeqStartPos->getSize() - 1;
   const int* starts = cpuSeqStartPos->getData(false);
diff --git a/paddle/gserver/layers/ExpandLayer.h b/paddle/gserver/layers/ExpandLayer.h
index 8a3eb1c973a4751dc062734b99c116f15a23138b..fbe0ced9b1754d72874071575b33f552ccf93cc6 100644
--- a/paddle/gserver/layers/ExpandLayer.h
+++ b/paddle/gserver/layers/ExpandLayer.h
@@ -44,14 +44,9 @@ protected:
   enum ExpandLevel { kNonSeq = 0, kSeq = 1 };
   /// store the ExpandLevel
   int type_;
-  // TODO(luotao) use ICpuGpuVectorPtr to merge cpuExpandStartsPos_
-  // and expandStartsPos_
   /// expanded sequenceStartPositions or subSequenceStartPositions
   /// of input[1]
-  IVectorPtr cpuExpandStartsPos_;
-  /// point to cpuExpandStartsPos_ when useGpu_ is false,
-  /// copy from cpuExpandStartsPos_ when useGpu_ is true
-  IVectorPtr expandStartsPos_;
+  ICpuGpuVectorPtr expandStartsPos_;
 
 public:
   explicit ExpandLayer(const LayerConfig& config) : Layer(config) {}
diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp
index a6ff2f3b35d04783c718a7ce5c42cd586f3eadae..78519ce7aa8742192eb15e5c4705572a7df5dbdc 100644
--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -282,13 +282,13 @@ void GpuMatrix::copyFrom(const IVector& src) {
   copyFrom(matrix);
 }
 
-void GpuMatrix::copyByRowIndex(Matrix& b, IVector& rowIndex) {
+void GpuMatrix::copyByRowIndex(Matrix& b, const IVector& rowIndex) {
   size_t height = getHeight();
   size_t width = getWidth();
   CHECK_EQ(b.getWidth(), width);
   real* dst = getData();
   real* src = b.getData();
-  int* index = rowIndex.getData();
+  const int* index = rowIndex.getData();
   hl_sequence2batch_copy(dst, src, index, width, height, true);
 }
@@ -1278,11 +1278,11 @@ void CpuMatrix::copyFrom(const IVector& src) {
   }
 }
 
-void CpuMatrix::copyByRowIndex(Matrix& b, IVector& rowIndex) {
+void CpuMatrix::copyByRowIndex(Matrix& b, const IVector& rowIndex) {
   size_t height = getHeight();
   size_t width = getWidth();
   CHECK_EQ(b.getWidth(), width);
-  int* index = rowIndex.getData();
+  const int* index = rowIndex.getData();
   for (size_t i = 0; i < height; i++) {
     CHECK_LT(static_cast<size_t>(index[i]), b.getHeight());
     real* src = b.getData() + index[i] * width;
diff --git a/paddle/math/Matrix.h b/paddle/math/Matrix.h
index 5c15c94012816eee6234298142c7c8baa53afae7..25104fe1c6d70afbf39ab47a17ce0bf21a121427 100644
--- a/paddle/math/Matrix.h
+++ b/paddle/math/Matrix.h
@@ -253,7 +253,7 @@ public:
     LOG(FATAL) << "copy data from int vector only available on CpuMatrix.";
   }
 
-  virtual void copyByRowIndex(Matrix& b, IVector& rowIndex) {
+  virtual void copyByRowIndex(Matrix& b, const IVector& rowIndex) {
     LOG(FATAL) << "Not implemented";
   }
 
@@ -979,7 +979,7 @@ public:
 
   void copyFrom(const IVector& src);
 
-  void copyByRowIndex(Matrix& b, IVector& rowIndex);
+  void copyByRowIndex(Matrix& b, const IVector& rowIndex);
 
   MatrixPtr clone(size_t height, size_t width, bool useGpu = false);
 
@@ -1241,7 +1241,7 @@ public:
 
   void copyFrom(CpuSparseMatrix& src);
 
-  void copyByRowIndex(Matrix& b, IVector& rowIndex);
+  void copyByRowIndex(Matrix& b, const IVector& rowIndex);
 
   MatrixPtr clone(size_t height, size_t width, bool useGpu = false);
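
For reference, the single-vector pattern adopted above in ExpandLayer::forward and SequenceScatterAgentLayer::forward can be summarized in isolation. The sketch below is illustrative only and not part of the patch: the helper name fillAndCopyByRowIndex and its argument list are hypothetical, and it assumes Paddle's internal Matrix/Vector headers. It shows the index buffer being written once through getMutableData(false), with getVector(useGpu) handing copyByRowIndex the vector on the requested device, so the explicit useGpu_ branch with an extra copyFrom at the call site goes away.

// Illustrative sketch only -- not part of the patch. Assumes Paddle's
// internal headers; the helper function itself is hypothetical.
#include "paddle/math/Matrix.h"
#include "paddle/math/Vector.h"

namespace paddle {

void fillAndCopyByRowIndex(Matrix& out, Matrix& in,
                           ICpuGpuVectorPtr& rowIndex,  // shared CPU/GPU index
                           size_t numRows, bool useGpu) {
  // One ICpuGpuVectorPtr replaces the old cpuXxxStartsPos_/xxxStartsPos_ pair.
  ICpuGpuVector::resizeOrCreate(rowIndex, numRows, /* useGpu */ false);

  // Fill the CPU-side buffer; the GPU copy is synchronized on demand.
  int* index = rowIndex->getMutableData(false);
  for (size_t i = 0; i < numRows; ++i) {
    index[i] = static_cast<int>(i);  // identity mapping, for illustration only
  }

  // getVector(useGpu) returns the IVector on the requested device, so no
  // explicit if (useGpu_) { resizeOrCreate + copyFrom } branch is needed here.
  out.copyByRowIndex(in, *rowIndex->getVector(useGpu));
}

}  // namespace paddle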