提交 688eeefa 编写于 作者: H Haonan 提交者: Yu Yang

fixed issues with synchronizing streams when copying from GPU to CPU

* by default, synchronize default_stream after resizeAndCopyFrom
* add sync in some places after resizeAndCopyFrom using other streams
上级 42a11791
...@@ -196,7 +196,6 @@ public: ...@@ -196,7 +196,6 @@ public:
Argument output, label; Argument output, label;
output.resizeAndCopyFrom(arguments[0], false); output.resizeAndCopyFrom(arguments[0], false);
label.resizeAndCopyFrom(arguments[1], false); label.resizeAndCopyFrom(arguments[1], false);
hl_stream_synchronize(HPPL_STREAM_DEFAULT);
CHECK(label.sequenceStartPositions); CHECK(label.sequenceStartPositions);
CHECK(label.ids); CHECK(label.ids);
size_t numSequences = label.sequenceStartPositions->getSize() - 1; size_t numSequences = label.sequenceStartPositions->getSize() - 1;
......
...@@ -878,11 +878,7 @@ void TrainerThread::copyOutputGrad() { ...@@ -878,11 +878,7 @@ void TrainerThread::copyOutputGrad() {
outArgs_.resize(outputGradArgs.size()); outArgs_.resize(outputGradArgs.size());
for (size_t i = 0; i < outputGradArgs.size(); i++) { for (size_t i = 0; i < outputGradArgs.size(); i++) {
outArgs_[i].resizeAndCopyFrom(outputGradArgs[i], startSeq, copySize, outArgs_[i].resizeAndCopyFrom(outputGradArgs[i], startSeq, copySize,
multiMachine_->useGpu(), multiMachine_->useGpu());
HPPL_STREAM_DEFAULT);
}
if (multiMachine_->useGpu()) {
hl_stream_synchronize(HPPL_STREAM_DEFAULT);
} }
gradientMachine_->setOutputGrad(outArgs_); gradientMachine_->setOutputGrad(outArgs_);
} }
......
...@@ -51,6 +51,7 @@ void CTCLayer::forward(PassType passType) { ...@@ -51,6 +51,7 @@ void CTCLayer::forward(PassType passType) {
for (size_t i = 0; i < inputLayers_.size(); i++) { for (size_t i = 0; i < inputLayers_.size(); i++) {
tmpCpuInput_[i].resizeAndCopyFrom(getInput(i), false, HPPL_STREAM_1); tmpCpuInput_[i].resizeAndCopyFrom(getInput(i), false, HPPL_STREAM_1);
} }
hl_stream_synchronize(HPPL_STREAM_1);
forwardImp(tmpCpuInput_[0], tmpCpuInput_[1]); forwardImp(tmpCpuInput_[0], tmpCpuInput_[1]);
} else { } else {
forwardImp(getInput(0), getInput(1)); forwardImp(getInput(0), getInput(1));
......
...@@ -511,6 +511,7 @@ void HuberTwoClass::forwardImp(Matrix &output, Argument &label, ...@@ -511,6 +511,7 @@ void HuberTwoClass::forwardImp(Matrix &output, Argument &label,
for (size_t i = 0; i < inputLayers_.size(); i++) { for (size_t i = 0; i < inputLayers_.size(); i++) {
tmpCpuInput_[i].resizeAndCopyFrom(getInput(i), false, HPPL_STREAM_1); tmpCpuInput_[i].resizeAndCopyFrom(getInput(i), false, HPPL_STREAM_1);
} }
hl_stream_synchronize(HPPL_STREAM_1);
} }
forwardImpIn(output, label, cost); forwardImpIn(output, label, cost);
} }
......
...@@ -54,6 +54,7 @@ public: ...@@ -54,6 +54,7 @@ public:
for (size_t i = 0; i < inputLayers_.size(); i++) { for (size_t i = 0; i < inputLayers_.size(); i++) {
tmpCpuInput_[i].resizeAndCopyFrom(getInput(i), false, HPPL_STREAM_1); tmpCpuInput_[i].resizeAndCopyFrom(getInput(i), false, HPPL_STREAM_1);
} }
hl_stream_synchronize(HPPL_STREAM_1);
forwardImp(tmpCpuInput_[0]); forwardImp(tmpCpuInput_[0]);
} else { } else {
forwardImp(getInput(0)); forwardImp(getInput(0));
......
...@@ -92,7 +92,6 @@ void testState(LayerPtr testLayer, vector<DataLayerPtr>& dataLayers, ...@@ -92,7 +92,6 @@ void testState(LayerPtr testLayer, vector<DataLayerPtr>& dataLayers,
testLayer->forward(PASS_TEST); testLayer->forward(PASS_TEST);
Argument out; Argument out;
out.resizeAndCopyFrom(testLayer->getOutput(), /* useGpu= */ false); out.resizeAndCopyFrom(testLayer->getOutput(), /* useGpu= */ false);
hl_stream_synchronize(HPPL_STREAM_DEFAULT);
if (batchOut.value) { if (batchOut.value) {
size_t dim = batchOut.value->getWidth(); size_t dim = batchOut.value->getWidth();
ASSERT_TRUE((bool)out.value); ASSERT_TRUE((bool)out.value);
...@@ -220,7 +219,6 @@ void testBatchState(LayerPtr testLayer, vector<DataLayerPtr>& dataLayers, ...@@ -220,7 +219,6 @@ void testBatchState(LayerPtr testLayer, vector<DataLayerPtr>& dataLayers,
testLayer->forward(PASS_TEST); testLayer->forward(PASS_TEST);
Argument out; Argument out;
out.resizeAndCopyFrom(testLayer->getOutput(), /* useGpu= */ false); out.resizeAndCopyFrom(testLayer->getOutput(), /* useGpu= */ false);
hl_stream_synchronize(HPPL_STREAM_DEFAULT);
if (batchOut.value) { if (batchOut.value) {
size_t dim = batchOut.value->getWidth(); size_t dim = batchOut.value->getWidth();
ASSERT_TRUE((bool)out.value); ASSERT_TRUE((bool)out.value);
......
...@@ -299,7 +299,6 @@ void checkRecurrentLayer(LayerConfig layerConfig, size_t batchSize, ...@@ -299,7 +299,6 @@ void checkRecurrentLayer(LayerConfig layerConfig, size_t batchSize,
Argument& cpuInput = testCpu.dataLayer_->getOutput(); Argument& cpuInput = testCpu.dataLayer_->getOutput();
Argument& gpuInput = testGpu.dataLayer_->getOutput(); Argument& gpuInput = testGpu.dataLayer_->getOutput();
gpuInput.resizeAndCopyFrom(cpuInput, true); gpuInput.resizeAndCopyFrom(cpuInput, true);
hl_stream_synchronize(HPPL_STREAM_DEFAULT);
const VectorPtr& cpuVec = testCpu.para_->getBuf(PARAMETER_VALUE); const VectorPtr& cpuVec = testCpu.para_->getBuf(PARAMETER_VALUE);
const VectorPtr& gpuVec = testGpu.para_->getBuf(PARAMETER_VALUE); const VectorPtr& gpuVec = testGpu.para_->getBuf(PARAMETER_VALUE);
......
...@@ -146,6 +146,7 @@ void Matrix::resizeOrCreate(MatrixPtr& matrix, size_t height, size_t width, ...@@ -146,6 +146,7 @@ void Matrix::resizeOrCreate(MatrixPtr& matrix, size_t height, size_t width,
if (!matrix) { if (!matrix) {
matrix = Matrix::create(height, width, trans, useGpu); matrix = Matrix::create(height, width, trans, useGpu);
} else { } else {
CHECK_EQ(matrix->useGpu(), useGpu);
matrix->resize(height, width); matrix->resize(height, width);
} }
} }
...@@ -161,6 +162,7 @@ void Matrix::resizeOrCreateSparseMatrix(MatrixPtr& matrix, size_t height, ...@@ -161,6 +162,7 @@ void Matrix::resizeOrCreateSparseMatrix(MatrixPtr& matrix, size_t height,
} else { } else {
CHECK(dynamic_cast<CpuSparseMatrix*>(matrix.get()) || CHECK(dynamic_cast<CpuSparseMatrix*>(matrix.get()) ||
dynamic_cast<GpuSparseMatrix*>(matrix.get())); dynamic_cast<GpuSparseMatrix*>(matrix.get()));
CHECK_EQ(matrix->useGpu(), useGpu);
matrix->resize(height, width, nnz, valueType, format); matrix->resize(height, width, nnz, valueType, format);
} }
} }
......
...@@ -800,6 +800,7 @@ void CpuGpuVectorT<T>::resizeOrCreate(size_t size, bool useGpu) { ...@@ -800,6 +800,7 @@ void CpuGpuVectorT<T>::resizeOrCreate(size_t size, bool useGpu) {
} else if ((!useGpu) && (!cpuVectorT_)) { } else if ((!useGpu) && (!cpuVectorT_)) {
cpuVectorT_ = VectorT<T>::create(size, false); cpuVectorT_ = VectorT<T>::create(size, false);
} else { } else {
CHECK((useGpu && gpuVectorT_) || (!useGpu && cpuVectorT_));
this->resize(size, useGpu); this->resize(size, useGpu);
} }
} }
......
...@@ -22,11 +22,8 @@ namespace paddle { ...@@ -22,11 +22,8 @@ namespace paddle {
static void resizeAndCopy(MatrixPtr& dest, const MatrixPtr& src, bool useGpu, static void resizeAndCopy(MatrixPtr& dest, const MatrixPtr& src, bool useGpu,
hl_stream_t stream) { hl_stream_t stream) {
if (src) { if (src) {
if (!dest) { Matrix::resizeOrCreate(dest, src->getHeight(),
dest = src->clone(0, 0, useGpu); src->getWidth(), false, useGpu);
} else {
dest->resize(src->getHeight(), src->getWidth());
}
dest->copyFrom(*src, stream); dest->copyFrom(*src, stream);
} else { } else {
dest.reset(); dest.reset();
...@@ -60,14 +57,9 @@ static void resizeAndCopy(MatrixPtr& dest, const MatrixPtr& src, ...@@ -60,14 +57,9 @@ static void resizeAndCopy(MatrixPtr& dest, const MatrixPtr& src,
hl_stream_t stream = HPPL_STREAM_DEFAULT) { hl_stream_t stream = HPPL_STREAM_DEFAULT) {
if (src) { if (src) {
CHECK_LE((size_t)startRow + copySize, src->getHeight()); CHECK_LE((size_t)startRow + copySize, src->getHeight());
int height = copySize; int height = copySize;
int width = src->getWidth(); int width = src->getWidth();
if (!dest) { Matrix::resizeOrCreate(dest, height, width, false, useGpu);
dest = src->clone(height, width, useGpu);
} else {
dest->resize(height, width);
}
MatrixPtr submat = src->subMatrix(startRow, copySize); MatrixPtr submat = src->subMatrix(startRow, copySize);
if (dynamic_cast<GpuSparseMatrix*>(dest.get())) { if (dynamic_cast<GpuSparseMatrix*>(dest.get())) {
// copy a subMatrix of CpuSparseMatrix to GpuSparseMatrix. // copy a subMatrix of CpuSparseMatrix to GpuSparseMatrix.
...@@ -182,6 +174,11 @@ static void resizeAndCopy(SVectorPtr& dest, const SVectorPtr& src, ...@@ -182,6 +174,11 @@ static void resizeAndCopy(SVectorPtr& dest, const SVectorPtr& src,
} }
} }
void Argument::resizeAndCopyFrom(const Argument& src, bool useGpu) {
resizeAndCopyFrom(src, useGpu, HPPL_STREAM_DEFAULT);
hl_stream_synchronize(HPPL_STREAM_DEFAULT);
}
void Argument::resizeAndCopyFrom(const Argument& src, bool useGpu, void Argument::resizeAndCopyFrom(const Argument& src, bool useGpu,
hl_stream_t stream) { hl_stream_t stream) {
dataId = src.dataId; dataId = src.dataId;
...@@ -199,6 +196,14 @@ void Argument::resizeAndCopyFrom(const Argument& src, bool useGpu, ...@@ -199,6 +196,14 @@ void Argument::resizeAndCopyFrom(const Argument& src, bool useGpu,
resizeAndCopy(strs, src.strs, useGpu, stream); resizeAndCopy(strs, src.strs, useGpu, stream);
} }
int32_t Argument::resizeAndCopyFrom(const Argument& src, int32_t startSeq,
int32_t copySize, bool useGpu) {
int32_t size = resizeAndCopyFrom(src, startSeq, copySize, useGpu,
HPPL_STREAM_DEFAULT);
hl_stream_synchronize(HPPL_STREAM_DEFAULT);
return size;
}
int32_t Argument::resizeAndCopyFrom(const Argument& src, int32_t startSeq, int32_t Argument::resizeAndCopyFrom(const Argument& src, int32_t startSeq,
int32_t copySize, bool useGpu, int32_t copySize, bool useGpu,
hl_stream_t stream) { hl_stream_t stream) {
......
...@@ -205,11 +205,14 @@ struct Argument { ...@@ -205,11 +205,14 @@ struct Argument {
* return value: how many samples are copied * return value: how many samples are copied
*/ */
int32_t resizeAndCopyFrom(const Argument& src, int32_t startSeq, int32_t resizeAndCopyFrom(const Argument& src, int32_t startSeq,
int32_t copySize, bool useGpu = FLAGS_use_gpu, int32_t copySize, bool useGpu, hl_stream_t stream);
hl_stream_t stream = HPPL_STREAM_DEFAULT);
void resizeAndCopyFrom(const Argument& src, bool useGpu = FLAGS_use_gpu, int32_t resizeAndCopyFrom(const Argument& src, int32_t startSeq,
hl_stream_t stream = HPPL_STREAM_DEFAULT); int32_t copySize, bool useGpu = FLAGS_use_gpu);
void resizeAndCopyFrom(const Argument& src, bool useGpu, hl_stream_t stream);
void resizeAndCopyFrom(const Argument& src, bool useGpu = FLAGS_use_gpu);
/* /*
@brief Concatenate several arguments into one and put the result into it. @brief Concatenate several arguments into one and put the result into it.
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册