From 688eeefab3f23bc709c367bb51684b1a36b49f65 Mon Sep 17 00:00:00 2001
From: Haonan
Date: Tue, 6 Sep 2016 16:31:30 -0700
Subject: [PATCH] fixed issues with synchronizing streams when copying from
 GPU to CPU

* by default, synchronize HPPL_STREAM_DEFAULT after resizeAndCopyFrom
* add explicit synchronization after resizeAndCopyFrom calls that use
  other streams
---
 .../gserver/evaluators/CTCErrorEvaluator.cpp  |  1 -
 .../gradientmachines/MultiGradientMachine.cpp |  6 +----
 paddle/gserver/layers/CTCLayer.cpp            |  1 +
 paddle/gserver/layers/CostLayer.cpp           |  1 +
 paddle/gserver/layers/SamplingIdLayer.cpp     |  1 +
 paddle/gserver/tests/LayerGradUtil.cpp        |  2 --
 paddle/gserver/tests/test_RecurrentLayer.cpp  |  1 -
 paddle/math/Matrix.cpp                        |  2 ++
 paddle/math/Vector.cpp                        |  1 +
 paddle/parameter/Argument.cpp                 | 27 +++++++++++--------
 paddle/parameter/Argument.h                   | 11 +++++---
 11 files changed, 30 insertions(+), 24 deletions(-)

diff --git a/paddle/gserver/evaluators/CTCErrorEvaluator.cpp b/paddle/gserver/evaluators/CTCErrorEvaluator.cpp
index d0b1c0447d2..a0c68fc9c29 100644
--- a/paddle/gserver/evaluators/CTCErrorEvaluator.cpp
+++ b/paddle/gserver/evaluators/CTCErrorEvaluator.cpp
@@ -196,7 +196,6 @@ public:
     Argument output, label;
     output.resizeAndCopyFrom(arguments[0], false);
     label.resizeAndCopyFrom(arguments[1], false);
-    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
     CHECK(label.sequenceStartPositions);
     CHECK(label.ids);
     size_t numSequences = label.sequenceStartPositions->getSize() - 1;
diff --git a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp b/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
index 787ce703a08..74a743145da 100644
--- a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
@@ -878,11 +878,7 @@ void TrainerThread::copyOutputGrad() {
   outArgs_.resize(outputGradArgs.size());
   for (size_t i = 0; i < outputGradArgs.size(); i++) {
     outArgs_[i].resizeAndCopyFrom(outputGradArgs[i], startSeq, copySize,
-                                  multiMachine_->useGpu(),
-                                  HPPL_STREAM_DEFAULT);
-  }
-  if (multiMachine_->useGpu()) {
-    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
+                                  multiMachine_->useGpu());
   }
   gradientMachine_->setOutputGrad(outArgs_);
 }
diff --git a/paddle/gserver/layers/CTCLayer.cpp b/paddle/gserver/layers/CTCLayer.cpp
index db1450694ec..aa9c0d8a4b6 100644
--- a/paddle/gserver/layers/CTCLayer.cpp
+++ b/paddle/gserver/layers/CTCLayer.cpp
@@ -51,6 +51,7 @@ void CTCLayer::forward(PassType passType) {
     for (size_t i = 0; i < inputLayers_.size(); i++) {
       tmpCpuInput_[i].resizeAndCopyFrom(getInput(i), false, HPPL_STREAM_1);
     }
+    hl_stream_synchronize(HPPL_STREAM_1);
     forwardImp(tmpCpuInput_[0], tmpCpuInput_[1]);
   } else {
     forwardImp(getInput(0), getInput(1));
diff --git a/paddle/gserver/layers/CostLayer.cpp b/paddle/gserver/layers/CostLayer.cpp
index f353afabb3b..b778f3e9b06 100644
--- a/paddle/gserver/layers/CostLayer.cpp
+++ b/paddle/gserver/layers/CostLayer.cpp
@@ -511,6 +511,7 @@ void HuberTwoClass::forwardImp(Matrix &output, Argument &label,
     for (size_t i = 0; i < inputLayers_.size(); i++) {
       tmpCpuInput_[i].resizeAndCopyFrom(getInput(i), false, HPPL_STREAM_1);
     }
+    hl_stream_synchronize(HPPL_STREAM_1);
   }
   forwardImpIn(output, label, cost);
 }
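The CTCLayer and CostLayer hunks above, and the SamplingIdLayer hunk below, share
one pattern: GPU inputs are copied to CPU buffers asynchronously on the side
stream HPPL_STREAM_1, and a single hl_stream_synchronize blocks once, after all
copies are queued, before any CPU code reads the buffers. A minimal sketch of the
pattern (the member and function names follow the patch; the surrounding function
is illustrative only):

    void forwardOnCpu() {
      // Queue one async device-to-host copy per input on the side stream.
      for (size_t i = 0; i < inputLayers_.size(); i++) {
        tmpCpuInput_[i].resizeAndCopyFrom(getInput(i), false, HPPL_STREAM_1);
      }
      // Block once for all copies queued on HPPL_STREAM_1; without this,
      // forwardImp could read host buffers the GPU is still writing.
      hl_stream_synchronize(HPPL_STREAM_1);
      forwardImp(tmpCpuInput_[0], tmpCpuInput_[1]);
    }
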
diff --git a/paddle/gserver/layers/SamplingIdLayer.cpp b/paddle/gserver/layers/SamplingIdLayer.cpp
index 41c1461967a..cbc85b946f0 100644
--- a/paddle/gserver/layers/SamplingIdLayer.cpp
+++ b/paddle/gserver/layers/SamplingIdLayer.cpp
@@ -54,6 +54,7 @@ public:
       for (size_t i = 0; i < inputLayers_.size(); i++) {
         tmpCpuInput_[i].resizeAndCopyFrom(getInput(i), false, HPPL_STREAM_1);
       }
+      hl_stream_synchronize(HPPL_STREAM_1);
       forwardImp(tmpCpuInput_[0]);
     } else {
       forwardImp(getInput(0));
diff --git a/paddle/gserver/tests/LayerGradUtil.cpp b/paddle/gserver/tests/LayerGradUtil.cpp
index f72011ae16c..552a6c5b41c 100644
--- a/paddle/gserver/tests/LayerGradUtil.cpp
+++ b/paddle/gserver/tests/LayerGradUtil.cpp
@@ -92,7 +92,6 @@ void testState(LayerPtr testLayer, vector<DataLayerPtr>& dataLayers,
     testLayer->forward(PASS_TEST);
     Argument out;
     out.resizeAndCopyFrom(testLayer->getOutput(), /* useGpu= */ false);
-    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
     if (batchOut.value) {
       size_t dim = batchOut.value->getWidth();
       ASSERT_TRUE((bool)out.value);
@@ -220,7 +219,6 @@ void testBatchState(LayerPtr testLayer, vector<DataLayerPtr>& dataLayers,
     testLayer->forward(PASS_TEST);
     Argument out;
     out.resizeAndCopyFrom(testLayer->getOutput(), /* useGpu= */ false);
-    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
     if (batchOut.value) {
       size_t dim = batchOut.value->getWidth();
       ASSERT_TRUE((bool)out.value);
diff --git a/paddle/gserver/tests/test_RecurrentLayer.cpp b/paddle/gserver/tests/test_RecurrentLayer.cpp
index 2cea190b859..9b933b153d1 100644
--- a/paddle/gserver/tests/test_RecurrentLayer.cpp
+++ b/paddle/gserver/tests/test_RecurrentLayer.cpp
@@ -299,7 +299,6 @@ void checkRecurrentLayer(LayerConfig layerConfig, size_t batchSize,
   Argument& cpuInput = testCpu.dataLayer_->getOutput();
   Argument& gpuInput = testGpu.dataLayer_->getOutput();
   gpuInput.resizeAndCopyFrom(cpuInput, true);
-  hl_stream_synchronize(HPPL_STREAM_DEFAULT);
 
   const VectorPtr& cpuVec = testCpu.para_->getBuf(PARAMETER_VALUE);
   const VectorPtr& gpuVec = testGpu.para_->getBuf(PARAMETER_VALUE);
diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp
index f3a6503d4a2..1b7f9ac5dac 100644
--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -146,6 +146,7 @@ void Matrix::resizeOrCreate(MatrixPtr& matrix, size_t height, size_t width,
   if (!matrix) {
     matrix = Matrix::create(height, width, trans, useGpu);
   } else {
+    CHECK_EQ(matrix->useGpu(), useGpu);
     matrix->resize(height, width);
   }
 }
@@ -161,6 +162,7 @@ void Matrix::resizeOrCreateSparseMatrix(MatrixPtr& matrix, size_t height,
   } else {
     CHECK(dynamic_cast<CpuSparseMatrix*>(matrix.get()) ||
           dynamic_cast<GpuSparseMatrix*>(matrix.get()));
+    CHECK_EQ(matrix->useGpu(), useGpu);
     matrix->resize(height, width, nnz, valueType, format);
   }
 }
diff --git a/paddle/math/Vector.cpp b/paddle/math/Vector.cpp
index b1a459b86aa..7553ea25e09 100644
--- a/paddle/math/Vector.cpp
+++ b/paddle/math/Vector.cpp
@@ -800,6 +800,7 @@ void CpuGpuVectorT<T>::resizeOrCreate(size_t size, bool useGpu) {
   } else if ((!useGpu) && (!cpuVectorT_)) {
     cpuVectorT_ = VectorT<T>::create(size, false);
   } else {
+    CHECK((useGpu && gpuVectorT_) || (!useGpu && cpuVectorT_));
     this->resize(size, useGpu);
   }
 }
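The Argument.cpp and Argument.h changes below implement the commit message's
first bullet by splitting resizeAndCopyFrom into two layers: the overloads that
take an hl_stream_t stay fully asynchronous, while new stream-less overloads
wrap them, enqueue on HPPL_STREAM_DEFAULT, and synchronize before returning.
Condensed from the hunks that follow (shown for the scalar overload; the
startSeq/copySize overload has the same shape):

    // Asynchronous primitive: the caller owns synchronization.
    void Argument::resizeAndCopyFrom(const Argument& src, bool useGpu,
                                     hl_stream_t stream);

    // New synchronous default: enqueue on the default stream, then block,
    // so the copied data is ready when the call returns.
    void Argument::resizeAndCopyFrom(const Argument& src, bool useGpu) {
      resizeAndCopyFrom(src, useGpu, HPPL_STREAM_DEFAULT);
      hl_stream_synchronize(HPPL_STREAM_DEFAULT);
    }

This is why the call sites above could drop their manual
hl_stream_synchronize(HPPL_STREAM_DEFAULT) lines: the synchronization moved
into the default overload.
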
diff --git a/paddle/parameter/Argument.cpp b/paddle/parameter/Argument.cpp
index 8610a664523..0e4d676c899 100644
--- a/paddle/parameter/Argument.cpp
+++ b/paddle/parameter/Argument.cpp
@@ -22,11 +22,8 @@ namespace paddle {
 static void resizeAndCopy(MatrixPtr& dest, const MatrixPtr& src, bool useGpu,
                           hl_stream_t stream) {
   if (src) {
-    if (!dest) {
-      dest = src->clone(0, 0, useGpu);
-    } else {
-      dest->resize(src->getHeight(), src->getWidth());
-    }
+    Matrix::resizeOrCreate(dest, src->getHeight(),
+                           src->getWidth(), false, useGpu);
     dest->copyFrom(*src, stream);
   } else {
     dest.reset();
@@ -60,14 +57,9 @@ static void resizeAndCopy(MatrixPtr& dest, const MatrixPtr& src,
                           hl_stream_t stream = HPPL_STREAM_DEFAULT) {
   if (src) {
     CHECK_LE((size_t)startRow + copySize, src->getHeight());
-
     int height = copySize;
     int width = src->getWidth();
-    if (!dest) {
-      dest = src->clone(height, width, useGpu);
-    } else {
-      dest->resize(height, width);
-    }
+    Matrix::resizeOrCreate(dest, height, width, false, useGpu);
     MatrixPtr submat = src->subMatrix(startRow, copySize);
     if (dynamic_cast<GpuSparseMatrix*>(dest.get())) {
       // copy a subMatrix of CpuSparseMatrix to GpuSparseMatrix.
@@ -182,6 +174,11 @@ static void resizeAndCopy(SVectorPtr& dest, const SVectorPtr& src,
   }
 }
 
+void Argument::resizeAndCopyFrom(const Argument& src, bool useGpu) {
+  resizeAndCopyFrom(src, useGpu, HPPL_STREAM_DEFAULT);
+  hl_stream_synchronize(HPPL_STREAM_DEFAULT);
+}
+
 void Argument::resizeAndCopyFrom(const Argument& src, bool useGpu,
                                  hl_stream_t stream) {
   dataId = src.dataId;
@@ -199,6 +196,14 @@ void Argument::resizeAndCopyFrom(const Argument& src, bool useGpu,
   resizeAndCopy(strs, src.strs, useGpu, stream);
 }
 
+int32_t Argument::resizeAndCopyFrom(const Argument& src, int32_t startSeq,
+                                    int32_t copySize, bool useGpu) {
+  int32_t size = resizeAndCopyFrom(src, startSeq, copySize, useGpu,
+                                   HPPL_STREAM_DEFAULT);
+  hl_stream_synchronize(HPPL_STREAM_DEFAULT);
+  return size;
+}
+
 int32_t Argument::resizeAndCopyFrom(const Argument& src, int32_t startSeq,
                                     int32_t copySize, bool useGpu,
                                     hl_stream_t stream) {
diff --git a/paddle/parameter/Argument.h b/paddle/parameter/Argument.h
index c444ebaf129..34ffeba7b53 100644
--- a/paddle/parameter/Argument.h
+++ b/paddle/parameter/Argument.h
@@ -205,11 +205,14 @@ struct Argument {
    * return value: how many samples are copied
    */
   int32_t resizeAndCopyFrom(const Argument& src, int32_t startSeq,
-                            int32_t copySize, bool useGpu = FLAGS_use_gpu,
-                            hl_stream_t stream = HPPL_STREAM_DEFAULT);
+                            int32_t copySize, bool useGpu, hl_stream_t stream);
 
-  void resizeAndCopyFrom(const Argument& src, bool useGpu = FLAGS_use_gpu,
-                         hl_stream_t stream = HPPL_STREAM_DEFAULT);
+  int32_t resizeAndCopyFrom(const Argument& src, int32_t startSeq,
+                            int32_t copySize, bool useGpu = FLAGS_use_gpu);
+
+  void resizeAndCopyFrom(const Argument& src, bool useGpu, hl_stream_t stream);
+
+  void resizeAndCopyFrom(const Argument& src, bool useGpu = FLAGS_use_gpu);
 
   /* @brief Concatenate several arguments into one and put the result into it.
-- 
GitLab
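For call sites, the net effect is that the common form is now safe by default
while the asynchronous form remains an explicit opt-in. A before/after sketch
(`layer` is a hypothetical LayerPtr standing in for any output producer; the
overloads are the ones this patch declares in Argument.h):

    Argument out;

    // Before this patch, the default-stream copy was asynchronous, so every
    // call site had to remember its own synchronize before reading `out`:
    //   out.resizeAndCopyFrom(layer->getOutput(), /* useGpu= */ false);
    //   hl_stream_synchronize(HPPL_STREAM_DEFAULT);

    // After: the stream-less overload synchronizes internally.
    out.resizeAndCopyFrom(layer->getOutput(), /* useGpu= */ false);

    // Callers that want to overlap copies still pass a stream explicitly
    // and synchronize it themselves, as the layer changes above do:
    out.resizeAndCopyFrom(layer->getOutput(), /* useGpu= */ false,
                          HPPL_STREAM_1);
    hl_stream_synchronize(HPPL_STREAM_1);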