diff --git a/paddle/gserver/evaluators/CTCErrorEvaluator.cpp b/paddle/gserver/evaluators/CTCErrorEvaluator.cpp
index a0c68fc9c29c4d4c40238be003f2fc86d371acc2..cd4ed19c2ca45c310032c834da4cad56fb1cbdff 100644
--- a/paddle/gserver/evaluators/CTCErrorEvaluator.cpp
+++ b/paddle/gserver/evaluators/CTCErrorEvaluator.cpp
@@ -194,8 +194,9 @@ public:
   virtual real evalImp(std::vector<Argument>& arguments) {
     CHECK_EQ(arguments.size(), (size_t)2);
     Argument output, label;
-    output.resizeAndCopyFrom(arguments[0], false);
-    label.resizeAndCopyFrom(arguments[1], false);
+    output.resizeAndCopyFrom(arguments[0], false, HPPL_STREAM_DEFAULT);
+    label.resizeAndCopyFrom(arguments[1], false, HPPL_STREAM_DEFAULT);
+    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
     CHECK(label.sequenceStartPositions);
     CHECK(label.ids);
     size_t numSequences = label.sequenceStartPositions->getSize() - 1;
diff --git a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp b/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
index 74a743145da8bdf332ad5a2af2e80fc685bec69a..787ce703a08aef602ac9603dbd7d48b807c7c6d5 100644
--- a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
@@ -878,7 +878,11 @@ void TrainerThread::copyOutputGrad() {
   outArgs_.resize(outputGradArgs.size());
   for (size_t i = 0; i < outputGradArgs.size(); i++) {
     outArgs_[i].resizeAndCopyFrom(outputGradArgs[i], startSeq, copySize,
-                                  multiMachine_->useGpu());
+                                  multiMachine_->useGpu(),
+                                  HPPL_STREAM_DEFAULT);
+  }
+  if (multiMachine_->useGpu()) {
+    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
   }
   gradientMachine_->setOutputGrad(outArgs_);
 }
diff --git a/paddle/gserver/layers/CTCLayer.cpp b/paddle/gserver/layers/CTCLayer.cpp
index aa9c0d8a4b68fc6884257f2ad19715de38b29d08..6b9ffc5c749fb45be567881b8e625b48e28f69b4 100644
--- a/paddle/gserver/layers/CTCLayer.cpp
+++ b/paddle/gserver/layers/CTCLayer.cpp
@@ -49,9 +49,10 @@ void CTCLayer::forward(PassType passType) {
   Layer::forward(passType);
   if (useGpu_) {
     for (size_t i = 0; i < inputLayers_.size(); i++) {
-      tmpCpuInput_[i].resizeAndCopyFrom(getInput(i), false, HPPL_STREAM_1);
+      tmpCpuInput_[i].resizeAndCopyFrom(
+          getInput(i), false, HPPL_STREAM_DEFAULT);
     }
-    hl_stream_synchronize(HPPL_STREAM_1);
+    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
     forwardImp(tmpCpuInput_[0], tmpCpuInput_[1]);
   } else {
     forwardImp(getInput(0), getInput(1));
@@ -93,9 +94,9 @@ void CTCLayer::backward(const UpdateCallback &callback) {
   if (useGpu_) {
     backwardImp(callback, tmpCpuInput_[0], tmpCpuInput_[1]);
     const_cast<Argument&>(getInput(0)).
-      resizeAndCopyFrom(tmpCpuInput_[0], true, HPPL_STREAM_1);
+      resizeAndCopyFrom(tmpCpuInput_[0], true, HPPL_STREAM_DEFAULT);
     const_cast<Argument&>(getInput(1)).
-      resizeAndCopyFrom(tmpCpuInput_[1], true, HPPL_STREAM_1);
+      resizeAndCopyFrom(tmpCpuInput_[1], true, HPPL_STREAM_DEFAULT);
   } else {
     backwardImp(callback, getInput(0), getInput(1));
   }
diff --git a/paddle/gserver/layers/ConvOperator.cpp b/paddle/gserver/layers/ConvOperator.cpp
index d08c422764e5642816a94fc55b5b67445ffb42f7..8c72c1778451dfddbaa740921cd08cf73fe56785 100644
--- a/paddle/gserver/layers/ConvOperator.cpp
+++ b/paddle/gserver/layers/ConvOperator.cpp
@@ -248,7 +248,7 @@ void ConvOperator::forward() {
   CHECK_EQ(ins_[1]->value->getHeight(), batchSize);
   checkFilterSize(ins_[1]->value);
   Matrix::resizeOrCreate(out_->value, batchSize,
-                         outputH_ * outputW_ * numFilters_);
+                         outputH_ * outputW_ * numFilters_, false, useGpu_);
   {
     AsyncGpuBlock block;
     for (size_t batchId = 0; batchId < batchSize; ++batchId) {
diff --git a/paddle/gserver/layers/CostLayer.cpp b/paddle/gserver/layers/CostLayer.cpp
index b778f3e9b064fd1b327c272b46f26a8724add3b6..0f99aee03200c3834c7c27343f41f77edc5a558e 100644
--- a/paddle/gserver/layers/CostLayer.cpp
+++ b/paddle/gserver/layers/CostLayer.cpp
@@ -509,9 +509,10 @@ void HuberTwoClass::forwardImp(Matrix &output, Argument &label,
                                Matrix &cost) {
   if (useGpu_) {
     for (size_t i = 0; i < inputLayers_.size(); i++) {
-      tmpCpuInput_[i].resizeAndCopyFrom(getInput(i), false, HPPL_STREAM_1);
+      tmpCpuInput_[i].resizeAndCopyFrom(
+          getInput(i), false, HPPL_STREAM_DEFAULT);
     }
-    hl_stream_synchronize(HPPL_STREAM_1);
+    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
   }
   forwardImpIn(output, label, cost);
 }
diff --git a/paddle/gserver/layers/SamplingIdLayer.cpp b/paddle/gserver/layers/SamplingIdLayer.cpp
index cbc85b946f07955a843df02cac087b564dd91da3..b39c9948b53118b51090059fc554e76f94316f81 100644
--- a/paddle/gserver/layers/SamplingIdLayer.cpp
+++ b/paddle/gserver/layers/SamplingIdLayer.cpp
@@ -52,9 +52,10 @@ public:
     Layer::forward(passType);
    if (useGpu_) {
      for (size_t i = 0; i < inputLayers_.size(); i++) {
-        tmpCpuInput_[i].resizeAndCopyFrom(getInput(i), false, HPPL_STREAM_1);
+        tmpCpuInput_[i].resizeAndCopyFrom(
+            getInput(i), false, HPPL_STREAM_DEFAULT);
      }
-      hl_stream_synchronize(HPPL_STREAM_1);
+      hl_stream_synchronize(HPPL_STREAM_DEFAULT);
      forwardImp(tmpCpuInput_[0]);
    } else {
      forwardImp(getInput(0));
diff --git a/paddle/parameter/Argument.cpp b/paddle/parameter/Argument.cpp
index 0e4d676c899aeabf82f6f1b8a56a3b34e7523c45..a81c72aacbed3f4077afd5d604d16c922cf07013 100644
--- a/paddle/parameter/Argument.cpp
+++ b/paddle/parameter/Argument.cpp
@@ -22,8 +22,12 @@ namespace paddle {
 static void resizeAndCopy(MatrixPtr& dest, const MatrixPtr& src,
                           bool useGpu, hl_stream_t stream) {
   if (src) {
-    Matrix::resizeOrCreate(dest, src->getHeight(),
-                           src->getWidth(), false, useGpu);
+    if (!dest) {
+      dest = src->clone(0, 0, useGpu);
+    } else {
+      CHECK_EQ(dest->useGpu(), useGpu);
+      dest->resize(src->getHeight(), src->getWidth());
+    }
     dest->copyFrom(*src, stream);
   } else {
     dest.reset();
@@ -59,7 +63,12 @@
   CHECK_LE((size_t)startRow + copySize, src->getHeight());
   int height = copySize;
   int width = src->getWidth();
-  Matrix::resizeOrCreate(dest, height, width, false, useGpu);
+  if (!dest) {
+    dest = src->clone(height, width, useGpu);
+  } else {
+    CHECK_EQ(dest->useGpu(), useGpu);
+    dest->resize(height, width);
+  }
   MatrixPtr submat = src->subMatrix(startRow, copySize);
   if (dynamic_cast<GpuSparseMatrix*>(dest.get())) {
     // copy a subMatrix of CpuSparseMatrix to GpuSparseMatrix.
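Reviewer note on the Argument.cpp hunks above: Matrix::resizeOrCreate reuses an existing dest even when it lives on the wrong device, which is why the patch replaces it with an explicit clone-or-resize decision. Below is a minimal standalone sketch of that decision using a simplified stand-in type, not Paddle's real Matrix API.

#include <cassert>
#include <memory>

// Simplified stand-in for Paddle's Matrix; illustrative only.
struct Mat {
  size_t height, width;
  bool onGpu;
  // clone(0, 0, useGpu) copies the source's dimensions, as in the patch.
  std::shared_ptr<Mat> clone(size_t h, size_t w, bool useGpu) const {
    return std::make_shared<Mat>(Mat{h ? h : height, w ? w : width, useGpu});
  }
  void resize(size_t h, size_t w) { height = h; width = w; }
};
using MatPtr = std::shared_ptr<Mat>;

// Mirrors the patched resizeAndCopy: create dest on the requested device
// if it does not exist yet; otherwise insist its device matches, then resize.
void resizeForCopy(MatPtr& dest, const MatPtr& src, bool useGpu) {
  if (!dest) {
    dest = src->clone(0, 0, useGpu);
  } else {
    assert(dest->onGpu == useGpu);  // CHECK_EQ(dest->useGpu(), useGpu) in the patch
    dest->resize(src->height, src->width);
  }
}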
diff --git a/paddle/parameter/Argument.h b/paddle/parameter/Argument.h
index 34ffeba7b5307009a0215ed6f1dfb468b9173057..3cab87c700225db0c3e54e60aa8376db14bc2fb1 100644
--- a/paddle/parameter/Argument.h
+++ b/paddle/parameter/Argument.h
@@ -203,15 +203,27 @@ struct Argument {
    * startSeq: the sample id of start
    * copySize: how many samples need to copy
    * return value: how many samples are copied
+   * Note that if the stream is specified explicitly, the caller must
+   * also call hl_stream_synchronize on it after this function returns
    */
  int32_t resizeAndCopyFrom(const Argument& src, int32_t startSeq,
                            int32_t copySize, bool useGpu, hl_stream_t stream);

+  /*
+   * Same as the function above, except that the stream is
+   * HPPL_STREAM_DEFAULT and hl_stream_synchronize is called
+   * automatically inside it
+   */
  int32_t resizeAndCopyFrom(const Argument& src, int32_t startSeq,
                            int32_t copySize, bool useGpu = FLAGS_use_gpu);

  void resizeAndCopyFrom(const Argument& src, bool useGpu, hl_stream_t stream);

+  /*
+   * Same as the function above, except that the stream is
+   * HPPL_STREAM_DEFAULT and hl_stream_synchronize is called
+   * automatically inside it
+   */
  void resizeAndCopyFrom(const Argument& src, bool useGpu = FLAGS_use_gpu);

  /*
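For reviewers, a hedged sketch of the calling convention the new Argument.h comments describe. The helper function names are illustrative only, and it assumes the Argument.h declarations from the diff (with hl_stream_t and hl_stream_synchronize visible through Paddle's headers):

#include "paddle/parameter/Argument.h"

// Explicit-stream overload: the copy may be asynchronous, so the caller
// owns the synchronization, exactly as CTCErrorEvaluator.cpp now does.
void copyWithExplicitStream(paddle::Argument& dst,
                            const paddle::Argument& src) {
  dst.resizeAndCopyFrom(src, /* useGpu */ false, HPPL_STREAM_DEFAULT);
  hl_stream_synchronize(HPPL_STREAM_DEFAULT);  // required before reading dst
}

// Default overload: uses HPPL_STREAM_DEFAULT and synchronizes internally,
// so dst is safe to read as soon as the call returns.
void copyWithImplicitSync(paddle::Argument& dst,
                          const paddle::Argument& src) {
  dst.resizeAndCopyFrom(src, /* useGpu */ false);
}

The same split explains the other hunks: call sites that keep the explicit-stream overload (CTCLayer, CostLayer, SamplingIdLayer, MultiGradientMachine) now pass HPPL_STREAM_DEFAULT and synchronize it themselves.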