diff --git a/paddle/gserver/evaluators/CTCErrorEvaluator.cpp b/paddle/gserver/evaluators/CTCErrorEvaluator.cpp
index d0b1c0447d23d3e7072b2ee4f8e860708eb44bb2..a0c68fc9c29c4d4c40238be003f2fc86d371acc2 100644
--- a/paddle/gserver/evaluators/CTCErrorEvaluator.cpp
+++ b/paddle/gserver/evaluators/CTCErrorEvaluator.cpp
@@ -196,7 +196,6 @@ public:
     Argument output, label;
     output.resizeAndCopyFrom(arguments[0], false);
     label.resizeAndCopyFrom(arguments[1], false);
-    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
     CHECK(label.sequenceStartPositions);
     CHECK(label.ids);
     size_t numSequences = label.sequenceStartPositions->getSize() - 1;
diff --git a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp b/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
index 787ce703a08aef602ac9603dbd7d48b807c7c6d5..74a743145da8bdf332ad5a2af2e80fc685bec69a 100644
--- a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
@@ -878,11 +878,7 @@ void TrainerThread::copyOutputGrad() {
   outArgs_.resize(outputGradArgs.size());
   for (size_t i = 0; i < outputGradArgs.size(); i++) {
     outArgs_[i].resizeAndCopyFrom(outputGradArgs[i], startSeq, copySize,
-                                  multiMachine_->useGpu(),
-                                  HPPL_STREAM_DEFAULT);
-  }
-  if (multiMachine_->useGpu()) {
-    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
+                                  multiMachine_->useGpu());
   }
   gradientMachine_->setOutputGrad(outArgs_);
 }
diff --git a/paddle/gserver/layers/CTCLayer.cpp b/paddle/gserver/layers/CTCLayer.cpp
index db1450694ecf7608fb37790e841b967288378e1f..aa9c0d8a4b68fc6884257f2ad19715de38b29d08 100644
--- a/paddle/gserver/layers/CTCLayer.cpp
+++ b/paddle/gserver/layers/CTCLayer.cpp
@@ -51,6 +51,7 @@ void CTCLayer::forward(PassType passType) {
     for (size_t i = 0; i < inputLayers_.size(); i++) {
       tmpCpuInput_[i].resizeAndCopyFrom(getInput(i), false, HPPL_STREAM_1);
     }
+    hl_stream_synchronize(HPPL_STREAM_1);
     forwardImp(tmpCpuInput_[0], tmpCpuInput_[1]);
   } else {
     forwardImp(getInput(0), getInput(1));
diff --git a/paddle/gserver/layers/CostLayer.cpp b/paddle/gserver/layers/CostLayer.cpp
index f353afabb3b7162783fef4f9093630fb826c86cb..b778f3e9b064fd1b327c272b46f26a8724add3b6 100644
--- a/paddle/gserver/layers/CostLayer.cpp
+++ b/paddle/gserver/layers/CostLayer.cpp
@@ -511,6 +511,7 @@ void HuberTwoClass::forwardImp(Matrix &output, Argument &label,
     for (size_t i = 0; i < inputLayers_.size(); i++) {
       tmpCpuInput_[i].resizeAndCopyFrom(getInput(i), false, HPPL_STREAM_1);
     }
+    hl_stream_synchronize(HPPL_STREAM_1);
   }
   forwardImpIn(output, label, cost);
 }
diff --git a/paddle/gserver/layers/SamplingIdLayer.cpp b/paddle/gserver/layers/SamplingIdLayer.cpp
index 41c1461967ae1c0ff3c4b3a11e8f7405b58f6ab9..cbc85b946f07955a843df02cac087b564dd91da3 100644
--- a/paddle/gserver/layers/SamplingIdLayer.cpp
+++ b/paddle/gserver/layers/SamplingIdLayer.cpp
@@ -54,6 +54,7 @@ public:
       for (size_t i = 0; i < inputLayers_.size(); i++) {
         tmpCpuInput_[i].resizeAndCopyFrom(getInput(i), false, HPPL_STREAM_1);
       }
+      hl_stream_synchronize(HPPL_STREAM_1);
       forwardImp(tmpCpuInput_[0]);
     } else {
       forwardImp(getInput(0));
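Note on the gserver changes above: copying an Argument with an explicit hl_stream_t only enqueues the device-to-host transfer, so the layers that stage their inputs on HPPL_STREAM_1 (CTCLayer, HuberTwoClass, SamplingIdLayer) now synchronize that stream themselves before the CPU-side forwardImp() reads the staged data. CTCErrorEvaluator and TrainerThread::copyOutputGrad() instead switch to the stream-less resizeAndCopyFrom overloads (introduced in the Argument changes further down), which synchronize internally, and drop their manual hl_stream_synchronize calls. A minimal sketch of the layer-side pattern, reusing the member names from the hunks above (inputLayers_, tmpCpuInput_, getInput, forwardImp):

  // Enqueue asynchronous device-to-host copies on HPPL_STREAM_1.
  for (size_t i = 0; i < inputLayers_.size(); i++) {
    tmpCpuInput_[i].resizeAndCopyFrom(getInput(i), /* useGpu= */ false, HPPL_STREAM_1);
  }
  // Wait for every queued copy to finish; without this, forwardImp() could
  // read CPU buffers whose transfer has not completed yet.
  hl_stream_synchronize(HPPL_STREAM_1);
  forwardImp(tmpCpuInput_[0]);
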
diff --git a/paddle/gserver/tests/LayerGradUtil.cpp b/paddle/gserver/tests/LayerGradUtil.cpp
index f72011ae16cb3bac73e8acd5338bd7a179da329b..552a6c5b41c7f896c52b2132578b136200967573 100644
--- a/paddle/gserver/tests/LayerGradUtil.cpp
+++ b/paddle/gserver/tests/LayerGradUtil.cpp
@@ -92,7 +92,6 @@ void testState(LayerPtr testLayer, vector<DataLayerPtr>& dataLayers,
     testLayer->forward(PASS_TEST);
     Argument out;
     out.resizeAndCopyFrom(testLayer->getOutput(), /* useGpu= */ false);
-    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
     if (batchOut.value) {
       size_t dim = batchOut.value->getWidth();
       ASSERT_TRUE((bool)out.value);
@@ -220,7 +219,6 @@ void testBatchState(LayerPtr testLayer, vector<DataLayerPtr>& dataLayers,
     testLayer->forward(PASS_TEST);
     Argument out;
     out.resizeAndCopyFrom(testLayer->getOutput(), /* useGpu= */ false);
-    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
     if (batchOut.value) {
       size_t dim = batchOut.value->getWidth();
       ASSERT_TRUE((bool)out.value);
diff --git a/paddle/gserver/tests/test_RecurrentLayer.cpp b/paddle/gserver/tests/test_RecurrentLayer.cpp
index 2cea190b859496cd635fc5a8d1834779537d50e6..9b933b153d158bef565c0964232525ba99b8b3d4 100644
--- a/paddle/gserver/tests/test_RecurrentLayer.cpp
+++ b/paddle/gserver/tests/test_RecurrentLayer.cpp
@@ -299,7 +299,6 @@ void checkRecurrentLayer(LayerConfig layerConfig, size_t batchSize,
   Argument& cpuInput = testCpu.dataLayer_->getOutput();
   Argument& gpuInput = testGpu.dataLayer_->getOutput();
   gpuInput.resizeAndCopyFrom(cpuInput, true);
-  hl_stream_synchronize(HPPL_STREAM_DEFAULT);
 
   const VectorPtr& cpuVec = testCpu.para_->getBuf(PARAMETER_VALUE);
   const VectorPtr& gpuVec = testGpu.para_->getBuf(PARAMETER_VALUE);
diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp
index f3a6503d4a21ff8766f3289f8eee992d4d13045d..1b7f9ac5dac16c167dcc22930c28bc3521162b9b 100644
--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -146,6 +146,7 @@ void Matrix::resizeOrCreate(MatrixPtr& matrix, size_t height, size_t width,
   if (!matrix) {
     matrix = Matrix::create(height, width, trans, useGpu);
   } else {
+    CHECK_EQ(matrix->useGpu(), useGpu);
     matrix->resize(height, width);
   }
 }
@@ -161,6 +162,7 @@ void Matrix::resizeOrCreateSparseMatrix(MatrixPtr& matrix, size_t height,
   } else {
     CHECK(dynamic_cast<CpuSparseMatrix*>(matrix.get()) ||
           dynamic_cast<GpuSparseMatrix*>(matrix.get()));
+    CHECK_EQ(matrix->useGpu(), useGpu);
     matrix->resize(height, width, nnz, valueType, format);
   }
 }
diff --git a/paddle/math/Vector.cpp b/paddle/math/Vector.cpp
index b1a459b86aa4ff70e4e07267c8a902123f9d17c0..7553ea25e09d2f52f1f8b9205f954510b77cbfa9 100644
--- a/paddle/math/Vector.cpp
+++ b/paddle/math/Vector.cpp
@@ -800,6 +800,7 @@ void CpuGpuVectorT<T>::resizeOrCreate(size_t size, bool useGpu) {
   } else if ((!useGpu) && (!cpuVectorT_)) {
     cpuVectorT_ = VectorT<T>::create(size, false);
   } else {
+    CHECK((useGpu && gpuVectorT_) || (!useGpu && cpuVectorT_));
     this->resize(size, useGpu);
   }
 }
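The two test files above simply drop manual hl_stream_synchronize calls that the stream-less resizeAndCopyFrom overload now performs itself. The Matrix and Vector hunks are defensive rather than behavioral: Matrix::resizeOrCreate() and resizeOrCreateSparseMatrix() now CHECK that a pre-existing matrix already lives on the device requested by useGpu, and CpuGpuVectorT<T>::resizeOrCreate() checks that the vector it is about to resize exists for the requested side. A small hypothetical call site (not part of the patch) showing what the new CHECK_EQ catches:

  // m is created as a GPU matrix ...
  MatrixPtr m = Matrix::create(10, 20, /* trans= */ false, /* useGpu= */ true);
  // ... but later resized as if it were a CPU matrix. Previously the GPU
  // matrix was silently resized; with CHECK_EQ(matrix->useGpu(), useGpu)
  // the device mismatch now fails loudly at this call.
  Matrix::resizeOrCreate(m, 30, 20, /* trans= */ false, /* useGpu= */ false);
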
diff --git a/paddle/parameter/Argument.cpp b/paddle/parameter/Argument.cpp
index 8610a66452358e1b2e2a846ddfcf62a0ce99e22e..0e4d676c899aeabf82f6f1b8a56a3b34e7523c45 100644
--- a/paddle/parameter/Argument.cpp
+++ b/paddle/parameter/Argument.cpp
@@ -22,11 +22,8 @@ namespace paddle {
 static void resizeAndCopy(MatrixPtr& dest, const MatrixPtr& src, bool useGpu,
                           hl_stream_t stream) {
   if (src) {
-    if (!dest) {
-      dest = src->clone(0, 0, useGpu);
-    } else {
-      dest->resize(src->getHeight(), src->getWidth());
-    }
+    Matrix::resizeOrCreate(dest, src->getHeight(),
+                           src->getWidth(), false, useGpu);
     dest->copyFrom(*src, stream);
   } else {
     dest.reset();
   }
 }
@@ -60,14 +57,9 @@ static void resizeAndCopy(MatrixPtr& dest, const MatrixPtr& src,
                           hl_stream_t stream = HPPL_STREAM_DEFAULT) {
   if (src) {
     CHECK_LE((size_t)startRow + copySize, src->getHeight());
-
     int height = copySize;
     int width = src->getWidth();
-    if (!dest) {
-      dest = src->clone(height, width, useGpu);
-    } else {
-      dest->resize(height, width);
-    }
+    Matrix::resizeOrCreate(dest, height, width, false, useGpu);
     MatrixPtr submat = src->subMatrix(startRow, copySize);
     if (dynamic_cast<GpuSparseMatrix*>(dest.get())) {
       // copy a subMatrix of CpuSparseMatrix to GpuSparseMatrix.
@@ -182,6 +174,11 @@ static void resizeAndCopy(SVectorPtr& dest, const SVectorPtr& src,
   }
 }
 
+void Argument::resizeAndCopyFrom(const Argument& src, bool useGpu) {
+  resizeAndCopyFrom(src, useGpu, HPPL_STREAM_DEFAULT);
+  hl_stream_synchronize(HPPL_STREAM_DEFAULT);
+}
+
 void Argument::resizeAndCopyFrom(const Argument& src, bool useGpu,
                                  hl_stream_t stream) {
   dataId = src.dataId;
@@ -199,6 +196,14 @@ void Argument::resizeAndCopyFrom(const Argument& src, bool useGpu,
   resizeAndCopy(strs, src.strs, useGpu, stream);
 }
 
+int32_t Argument::resizeAndCopyFrom(const Argument& src, int32_t startSeq,
+                                    int32_t copySize, bool useGpu) {
+  int32_t size = resizeAndCopyFrom(src, startSeq, copySize, useGpu,
+                                   HPPL_STREAM_DEFAULT);
+  hl_stream_synchronize(HPPL_STREAM_DEFAULT);
+  return size;
+}
+
 int32_t Argument::resizeAndCopyFrom(const Argument& src, int32_t startSeq,
                                     int32_t copySize, bool useGpu,
                                     hl_stream_t stream) {
diff --git a/paddle/parameter/Argument.h b/paddle/parameter/Argument.h
index c444ebaf12930e938a3a4d75541d0fbf5bbb01ac..34ffeba7b5307009a0215ed6f1dfb468b9173057 100644
--- a/paddle/parameter/Argument.h
+++ b/paddle/parameter/Argument.h
@@ -205,11 +205,14 @@ struct Argument {
    * return value: how many samples are copied
    */
   int32_t resizeAndCopyFrom(const Argument& src, int32_t startSeq,
-                            int32_t copySize, bool useGpu = FLAGS_use_gpu,
-                            hl_stream_t stream = HPPL_STREAM_DEFAULT);
+                            int32_t copySize, bool useGpu, hl_stream_t stream);
 
-  void resizeAndCopyFrom(const Argument& src, bool useGpu = FLAGS_use_gpu,
-                         hl_stream_t stream = HPPL_STREAM_DEFAULT);
+  int32_t resizeAndCopyFrom(const Argument& src, int32_t startSeq,
+                            int32_t copySize, bool useGpu = FLAGS_use_gpu);
+
+  void resizeAndCopyFrom(const Argument& src, bool useGpu, hl_stream_t stream);
+
+  void resizeAndCopyFrom(const Argument& src, bool useGpu = FLAGS_use_gpu);
 
   /* @brief Concatenate several arguments into one and put the result into it.
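Taken together, the Argument changes split resizeAndCopyFrom() into two families: the overloads that take an hl_stream_t only enqueue the copy and leave synchronization to the caller, while the new stream-less overloads copy on HPPL_STREAM_DEFAULT and call hl_stream_synchronize() before returning, so the destination is safe to read immediately. A hedged usage sketch (src and dst are illustrative local variables, not names from the patch):

  Argument dst;

  // Convenience form: copies on HPPL_STREAM_DEFAULT and synchronizes
  // internally, so dst can be read as soon as the call returns.
  dst.resizeAndCopyFrom(src, /* useGpu= */ false);

  // Stream form: only queues the copy; the caller owns the synchronization
  // point and must wait before touching dst on the CPU.
  dst.resizeAndCopyFrom(src, /* useGpu= */ false, HPPL_STREAM_1);
  hl_stream_synchronize(HPPL_STREAM_1);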