From 0f91ea7ebbdfb50567ed5408562ca09eadec24ad Mon Sep 17 00:00:00 2001
From: Haonan
Date: Wed, 7 Sep 2016 11:20:36 -0700
Subject: [PATCH] use HPPL_STREAM_DEFAULT for layer computation

Change-Id: Id66da7b7f5bf9ec80cc19b347e4fb822a5a6f197
---
 paddle/gserver/evaluators/CTCErrorEvaluator.cpp |  5 +++--
 .../gradientmachines/MultiGradientMachine.cpp   |  6 +++++-
 paddle/gserver/layers/CTCLayer.cpp              |  9 +++++----
 paddle/gserver/layers/ConvOperator.cpp          |  2 +-
 paddle/gserver/layers/CostLayer.cpp             |  5 +++--
 paddle/gserver/layers/SamplingIdLayer.cpp       |  5 +++--
 paddle/parameter/Argument.cpp                   | 15 ++++++++++++---
 paddle/parameter/Argument.h                     | 12 ++++++++++++
 8 files changed, 44 insertions(+), 15 deletions(-)

diff --git a/paddle/gserver/evaluators/CTCErrorEvaluator.cpp b/paddle/gserver/evaluators/CTCErrorEvaluator.cpp
index a0c68fc9c2..cd4ed19c2c 100644
--- a/paddle/gserver/evaluators/CTCErrorEvaluator.cpp
+++ b/paddle/gserver/evaluators/CTCErrorEvaluator.cpp
@@ -194,8 +194,9 @@ public:
   virtual real evalImp(std::vector<Argument>& arguments) {
     CHECK_EQ(arguments.size(), (size_t)2);
     Argument output, label;
-    output.resizeAndCopyFrom(arguments[0], false);
-    label.resizeAndCopyFrom(arguments[1], false);
+    output.resizeAndCopyFrom(arguments[0], false, HPPL_STREAM_DEFAULT);
+    label.resizeAndCopyFrom(arguments[1], false, HPPL_STREAM_DEFAULT);
+    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
     CHECK(label.sequenceStartPositions);
     CHECK(label.ids);
     size_t numSequences = label.sequenceStartPositions->getSize() - 1;
diff --git a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp b/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
index 74a743145d..787ce703a0 100644
--- a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
@@ -878,7 +878,11 @@ void TrainerThread::copyOutputGrad() {
   outArgs_.resize(outputGradArgs.size());
   for (size_t i = 0; i < outputGradArgs.size(); i++) {
     outArgs_[i].resizeAndCopyFrom(outputGradArgs[i], startSeq, copySize,
-                                  multiMachine_->useGpu());
+                                  multiMachine_->useGpu(),
+                                  HPPL_STREAM_DEFAULT);
+  }
+  if (multiMachine_->useGpu()) {
+    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
   }
   gradientMachine_->setOutputGrad(outArgs_);
 }
diff --git a/paddle/gserver/layers/CTCLayer.cpp b/paddle/gserver/layers/CTCLayer.cpp
index aa9c0d8a4b..6b9ffc5c74 100644
--- a/paddle/gserver/layers/CTCLayer.cpp
+++ b/paddle/gserver/layers/CTCLayer.cpp
@@ -49,9 +49,10 @@ void CTCLayer::forward(PassType passType) {
   Layer::forward(passType);
   if (useGpu_) {
     for (size_t i = 0; i < inputLayers_.size(); i++) {
-      tmpCpuInput_[i].resizeAndCopyFrom(getInput(i), false, HPPL_STREAM_1);
+      tmpCpuInput_[i].resizeAndCopyFrom(
+          getInput(i), false, HPPL_STREAM_DEFAULT);
     }
-    hl_stream_synchronize(HPPL_STREAM_1);
+    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
     forwardImp(tmpCpuInput_[0], tmpCpuInput_[1]);
   } else {
     forwardImp(getInput(0), getInput(1));
@@ -93,9 +94,9 @@ void CTCLayer::backward(const UpdateCallback &callback) {
   if (useGpu_) {
     backwardImp(callback, tmpCpuInput_[0], tmpCpuInput_[1]);
     const_cast<Argument&>(getInput(0)).
-        resizeAndCopyFrom(tmpCpuInput_[0], true, HPPL_STREAM_1);
+        resizeAndCopyFrom(tmpCpuInput_[0], true, HPPL_STREAM_DEFAULT);
     const_cast<Argument&>(getInput(1)).
-        resizeAndCopyFrom(tmpCpuInput_[1], true, HPPL_STREAM_1);
+        resizeAndCopyFrom(tmpCpuInput_[1], true, HPPL_STREAM_DEFAULT);
   } else {
     backwardImp(callback, getInput(0), getInput(1));
   }
diff --git a/paddle/gserver/layers/ConvOperator.cpp b/paddle/gserver/layers/ConvOperator.cpp
index d08c422764..8c72c17784 100644
--- a/paddle/gserver/layers/ConvOperator.cpp
+++ b/paddle/gserver/layers/ConvOperator.cpp
@@ -248,7 +248,7 @@ void ConvOperator::forward() {
   CHECK_EQ(ins_[1]->value->getHeight(), batchSize);
   checkFilterSize(ins_[1]->value);
   Matrix::resizeOrCreate(out_->value, batchSize,
-                         outputH_ * outputW_ * numFilters_);
+                         outputH_ * outputW_ * numFilters_, false, useGpu_);
   {
     AsyncGpuBlock block;
     for (size_t batchId = 0; batchId < batchSize; ++batchId) {
diff --git a/paddle/gserver/layers/CostLayer.cpp b/paddle/gserver/layers/CostLayer.cpp
index b778f3e9b0..0f99aee032 100644
--- a/paddle/gserver/layers/CostLayer.cpp
+++ b/paddle/gserver/layers/CostLayer.cpp
@@ -509,9 +509,10 @@ void HuberTwoClass::forwardImp(Matrix &output, Argument &label,
                                Matrix &cost) {
   if (useGpu_) {
     for (size_t i = 0; i < inputLayers_.size(); i++) {
-      tmpCpuInput_[i].resizeAndCopyFrom(getInput(i), false, HPPL_STREAM_1);
+      tmpCpuInput_[i].resizeAndCopyFrom(
+          getInput(i), false, HPPL_STREAM_DEFAULT);
     }
-    hl_stream_synchronize(HPPL_STREAM_1);
+    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
   }
   forwardImpIn(output, label, cost);
 }
diff --git a/paddle/gserver/layers/SamplingIdLayer.cpp b/paddle/gserver/layers/SamplingIdLayer.cpp
index cbc85b946f..b39c9948b5 100644
--- a/paddle/gserver/layers/SamplingIdLayer.cpp
+++ b/paddle/gserver/layers/SamplingIdLayer.cpp
@@ -52,9 +52,10 @@ public:
     Layer::forward(passType);
     if (useGpu_) {
       for (size_t i = 0; i < inputLayers_.size(); i++) {
-        tmpCpuInput_[i].resizeAndCopyFrom(getInput(i), false, HPPL_STREAM_1);
+        tmpCpuInput_[i].resizeAndCopyFrom(
+            getInput(i), false, HPPL_STREAM_DEFAULT);
       }
-      hl_stream_synchronize(HPPL_STREAM_1);
+      hl_stream_synchronize(HPPL_STREAM_DEFAULT);
       forwardImp(tmpCpuInput_[0]);
     } else {
       forwardImp(getInput(0));
diff --git a/paddle/parameter/Argument.cpp b/paddle/parameter/Argument.cpp
index 0e4d676c89..a81c72aacb 100644
--- a/paddle/parameter/Argument.cpp
+++ b/paddle/parameter/Argument.cpp
@@ -22,8 +22,12 @@ namespace paddle {
 static void resizeAndCopy(MatrixPtr& dest, const MatrixPtr& src, bool useGpu,
                           hl_stream_t stream) {
   if (src) {
-    Matrix::resizeOrCreate(dest, src->getHeight(),
-                           src->getWidth(), false, useGpu);
+    if (!dest) {
+      dest = src->clone(0, 0, useGpu);
+    } else {
+      CHECK_EQ(dest->useGpu(), useGpu);
+      dest->resize(src->getHeight(), src->getWidth());
+    }
     dest->copyFrom(*src, stream);
   } else {
     dest.reset();
@@ -59,7 +63,12 @@ static void resizeAndCopy(MatrixPtr& dest, const MatrixPtr& src,
   CHECK_LE((size_t)startRow + copySize, src->getHeight());
   int height = copySize;
   int width = src->getWidth();
-  Matrix::resizeOrCreate(dest, height, width, false, useGpu);
+  if (!dest) {
+    dest = src->clone(height, width, useGpu);
+  } else {
+    CHECK_EQ(dest->useGpu(), useGpu);
+    dest->resize(height, width);
+  }
   MatrixPtr submat = src->subMatrix(startRow, copySize);
   if (dynamic_cast<GpuSparseMatrix*>(dest.get())) {
     // copy a subMatrix of CpuSparseMatrix to GpuSparseMatrix.
diff --git a/paddle/parameter/Argument.h b/paddle/parameter/Argument.h
index 34ffeba7b5..3cab87c700 100644
--- a/paddle/parameter/Argument.h
+++ b/paddle/parameter/Argument.h
@@ -203,15 +203,27 @@ struct Argument {
    * startSeq: the sample id of start
    * copySize: how many samples need to copy
    * return value: how many samples are copied
+   * Note that when the stream is specified explicitly, the caller
+   * must synchronize the stream after this function returns.
    */
   int32_t resizeAndCopyFrom(const Argument& src, int32_t startSeq,
                             int32_t copySize, bool useGpu, hl_stream_t stream);
 
+  /*
+   * Same as the above function, except that the stream is
+   * HPPL_STREAM_DEFAULT and synchronization is performed
+   * automatically inside it.
+   */
   int32_t resizeAndCopyFrom(const Argument& src, int32_t startSeq,
                             int32_t copySize, bool useGpu = FLAGS_use_gpu);
 
   void resizeAndCopyFrom(const Argument& src, bool useGpu, hl_stream_t stream);
 
+  /*
+   * Same as the above function, except that the stream is
+   * HPPL_STREAM_DEFAULT and synchronization is performed
+   * automatically inside it.
+   */
   void resizeAndCopyFrom(const Argument& src, bool useGpu = FLAGS_use_gpu);
 
   /*
--
GitLab
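
Usage note: the Argument.h comments above describe two overload families that
differ in who synchronizes. A minimal sketch of both call patterns follows; it
is not part of the patch, and the helper function and its `src` parameter are
hypothetical. Only resizeAndCopyFrom, HPPL_STREAM_DEFAULT, and
hl_stream_synchronize come from this diff.

    // Hypothetical helper illustrating the stream contract documented above.
    void copyForCpuCompute(const paddle::Argument& src) {
      // Pattern 1: explicit stream. The copy is queued asynchronously on the
      // stream, so the caller must synchronize before reading the result.
      paddle::Argument cpuArg;
      cpuArg.resizeAndCopyFrom(src, /* useGpu= */ false, HPPL_STREAM_DEFAULT);
      hl_stream_synchronize(HPPL_STREAM_DEFAULT);  // required by the contract
      // cpuArg is now safe to read.

      // Pattern 2: no stream argument. HPPL_STREAM_DEFAULT is used and the
      // synchronization happens inside resizeAndCopyFrom itself.
      paddle::Argument cpuArg2;
      cpuArg2.resizeAndCopyFrom(src, /* useGpu= */ false);
      // cpuArg2 is immediately safe to read.
    }

The layers touched by this patch (CTCLayer, CostLayer, SamplingIdLayer) use
pattern 1 so that several input copies can be queued on the stream in a loop
and synchronized with a single hl_stream_synchronize call afterwards.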