Commit 688eeefa authored by Haonan, committed by Yu Yang

fixed issues with synchronizing streams when copying from GPU to CPU

* by default, synchronize default_stream after resizeAndCopyFrom
* add explicit synchronization after resizeAndCopyFrom calls that use other streams
Parent 42a11791
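The gist of the fix: Argument::resizeAndCopyFrom issues an asynchronous device-to-host copy, so a caller that reads the CPU-side buffers before the stream is synchronized races with the copy. After this commit, the overloads without a stream parameter synchronize HPPL_STREAM_DEFAULT internally, while the overloads taking an explicit stream leave synchronization to the caller. A minimal sketch of the two calling conventions (illustrative only; the Paddle types and hl_stream_* calls are those shown in the hunks below):

// Sketch, not a standalone program: assumes Paddle's Argument API
// and the hl_stream_* functions as they appear in this commit.
void copyToCpu(paddle::Argument& dst, const paddle::Argument& src) {
  // No stream argument: copies via HPPL_STREAM_DEFAULT and synchronizes
  // internally, so dst is safe to read as soon as this returns.
  dst.resizeAndCopyFrom(src, /* useGpu= */ false);

  // Explicit stream: the copy stays asynchronous; the caller must
  // synchronize the same stream before touching dst.
  dst.resizeAndCopyFrom(src, /* useGpu= */ false, HPPL_STREAM_1);
  hl_stream_synchronize(HPPL_STREAM_1);
}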
......
@@ -196,7 +196,6 @@ public:
     Argument output, label;
     output.resizeAndCopyFrom(arguments[0], false);
     label.resizeAndCopyFrom(arguments[1], false);
-    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
     CHECK(label.sequenceStartPositions);
     CHECK(label.ids);
     size_t numSequences = label.sequenceStartPositions->getSize() - 1;
......
......
@@ -878,11 +878,7 @@ void TrainerThread::copyOutputGrad() {
   outArgs_.resize(outputGradArgs.size());
   for (size_t i = 0; i < outputGradArgs.size(); i++) {
     outArgs_[i].resizeAndCopyFrom(outputGradArgs[i], startSeq, copySize,
-                                  multiMachine_->useGpu(),
-                                  HPPL_STREAM_DEFAULT);
-  }
-  if (multiMachine_->useGpu()) {
-    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
+                                  multiMachine_->useGpu());
   }
   gradientMachine_->setOutputGrad(outArgs_);
 }
......
......
@@ -51,6 +51,7 @@ void CTCLayer::forward(PassType passType) {
     for (size_t i = 0; i < inputLayers_.size(); i++) {
       tmpCpuInput_[i].resizeAndCopyFrom(getInput(i), false, HPPL_STREAM_1);
     }
+    hl_stream_synchronize(HPPL_STREAM_1);
     forwardImp(tmpCpuInput_[0], tmpCpuInput_[1]);
   } else {
     forwardImp(getInput(0), getInput(1));
......
......
@@ -511,6 +511,7 @@ void HuberTwoClass::forwardImp(Matrix &output, Argument &label,
     for (size_t i = 0; i < inputLayers_.size(); i++) {
       tmpCpuInput_[i].resizeAndCopyFrom(getInput(i), false, HPPL_STREAM_1);
     }
+    hl_stream_synchronize(HPPL_STREAM_1);
   }
   forwardImpIn(output, label, cost);
 }
......
......
@@ -54,6 +54,7 @@ public:
     for (size_t i = 0; i < inputLayers_.size(); i++) {
       tmpCpuInput_[i].resizeAndCopyFrom(getInput(i), false, HPPL_STREAM_1);
     }
+    hl_stream_synchronize(HPPL_STREAM_1);
     forwardImp(tmpCpuInput_[0]);
   } else {
     forwardImp(getInput(0));
......
......
@@ -92,7 +92,6 @@ void testState(LayerPtr testLayer, vector<DataLayerPtr>& dataLayers,
     testLayer->forward(PASS_TEST);
     Argument out;
     out.resizeAndCopyFrom(testLayer->getOutput(), /* useGpu= */ false);
-    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
     if (batchOut.value) {
       size_t dim = batchOut.value->getWidth();
       ASSERT_TRUE((bool)out.value);
......
@@ -220,7 +219,6 @@ void testBatchState(LayerPtr testLayer, vector<DataLayerPtr>& dataLayers,
     testLayer->forward(PASS_TEST);
     Argument out;
     out.resizeAndCopyFrom(testLayer->getOutput(), /* useGpu= */ false);
-    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
     if (batchOut.value) {
       size_t dim = batchOut.value->getWidth();
       ASSERT_TRUE((bool)out.value);
......
......
@@ -299,7 +299,6 @@ void checkRecurrentLayer(LayerConfig layerConfig, size_t batchSize,
   Argument& cpuInput = testCpu.dataLayer_->getOutput();
   Argument& gpuInput = testGpu.dataLayer_->getOutput();
   gpuInput.resizeAndCopyFrom(cpuInput, true);
-  hl_stream_synchronize(HPPL_STREAM_DEFAULT);
   const VectorPtr& cpuVec = testCpu.para_->getBuf(PARAMETER_VALUE);
   const VectorPtr& gpuVec = testGpu.para_->getBuf(PARAMETER_VALUE);
......
......
@@ -146,6 +146,7 @@ void Matrix::resizeOrCreate(MatrixPtr& matrix, size_t height, size_t width,
   if (!matrix) {
     matrix = Matrix::create(height, width, trans, useGpu);
   } else {
+    CHECK_EQ(matrix->useGpu(), useGpu);
     matrix->resize(height, width);
   }
 }
......
@@ -161,6 +162,7 @@ void Matrix::resizeOrCreateSparseMatrix(MatrixPtr& matrix, size_t height,
   } else {
     CHECK(dynamic_cast<CpuSparseMatrix*>(matrix.get()) ||
           dynamic_cast<GpuSparseMatrix*>(matrix.get()));
+    CHECK_EQ(matrix->useGpu(), useGpu);
     matrix->resize(height, width, nnz, valueType, format);
   }
 }
......
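The two CHECK_EQ additions above guard reuse of an existing matrix: resizeOrCreate only allocates when the pointer is empty, so before this commit a request with a mismatched useGpu flag would silently resize a matrix living on the wrong device. A hedged illustration of the misuse this now catches (hypothetical call sequence, using only the signature shown in the hunk):

// Hypothetical misuse that the new CHECK_EQ turns into a hard failure.
paddle::MatrixPtr m;
paddle::Matrix::resizeOrCreate(m, 10, 10, /* trans= */ false, /* useGpu= */ true);
// Reusing m with useGpu=false used to silently resize the GPU matrix;
// now it fails CHECK_EQ(matrix->useGpu(), useGpu).
paddle::Matrix::resizeOrCreate(m, 20, 20, false, /* useGpu= */ false);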
......
@@ -800,6 +800,7 @@ void CpuGpuVectorT<T>::resizeOrCreate(size_t size, bool useGpu) {
   } else if ((!useGpu) && (!cpuVectorT_)) {
     cpuVectorT_ = VectorT<T>::create(size, false);
   } else {
+    CHECK((useGpu && gpuVectorT_) || (!useGpu && cpuVectorT_));
     this->resize(size, useGpu);
   }
 }
......
......
@@ -22,11 +22,8 @@ namespace paddle {
 static void resizeAndCopy(MatrixPtr& dest, const MatrixPtr& src, bool useGpu,
                           hl_stream_t stream) {
   if (src) {
-    if (!dest) {
-      dest = src->clone(0, 0, useGpu);
-    } else {
-      dest->resize(src->getHeight(), src->getWidth());
-    }
+    Matrix::resizeOrCreate(dest, src->getHeight(),
+                           src->getWidth(), false, useGpu);
     dest->copyFrom(*src, stream);
   } else {
     dest.reset();
......
@@ -60,14 +57,9 @@ static void resizeAndCopy(MatrixPtr& dest, const MatrixPtr& src,
                           hl_stream_t stream = HPPL_STREAM_DEFAULT) {
   if (src) {
     CHECK_LE((size_t)startRow + copySize, src->getHeight());
     int height = copySize;
     int width = src->getWidth();
-    if (!dest) {
-      dest = src->clone(height, width, useGpu);
-    } else {
-      dest->resize(height, width);
-    }
+    Matrix::resizeOrCreate(dest, height, width, false, useGpu);
     MatrixPtr submat = src->subMatrix(startRow, copySize);
     if (dynamic_cast<GpuSparseMatrix*>(dest.get())) {
       // copy a subMatrix of CpuSparseMatrix to GpuSparseMatrix.
......
@@ -182,6 +174,11 @@ static void resizeAndCopy(SVectorPtr& dest, const SVectorPtr& src,
   }
 }
 
+void Argument::resizeAndCopyFrom(const Argument& src, bool useGpu) {
+  resizeAndCopyFrom(src, useGpu, HPPL_STREAM_DEFAULT);
+  hl_stream_synchronize(HPPL_STREAM_DEFAULT);
+}
+
 void Argument::resizeAndCopyFrom(const Argument& src, bool useGpu,
                                  hl_stream_t stream) {
   dataId = src.dataId;
......
@@ -199,6 +196,14 @@ void Argument::resizeAndCopyFrom(const Argument& src, bool useGpu,
   resizeAndCopy(strs, src.strs, useGpu, stream);
 }
 
+int32_t Argument::resizeAndCopyFrom(const Argument& src, int32_t startSeq,
+                                    int32_t copySize, bool useGpu) {
+  int32_t size = resizeAndCopyFrom(src, startSeq, copySize, useGpu,
+                                   HPPL_STREAM_DEFAULT);
+  hl_stream_synchronize(HPPL_STREAM_DEFAULT);
+  return size;
+}
+
 int32_t Argument::resizeAndCopyFrom(const Argument& src, int32_t startSeq,
                                     int32_t copySize, bool useGpu,
                                     hl_stream_t stream) {
......
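Taken together with the call sites above, the net effect of the new wrappers is that default-stream callers drop their manual synchronization while explicit-stream callers add one. For example (lines taken from the hunks in this commit):

// Default stream: the sync now happens inside resizeAndCopyFrom.
output.resizeAndCopyFrom(arguments[0], false);

// Explicit stream (e.g. CTCLayer): the caller synchronizes itself.
tmpCpuInput_[i].resizeAndCopyFrom(getInput(i), false, HPPL_STREAM_1);
hl_stream_synchronize(HPPL_STREAM_1);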
......
@@ -205,11 +205,14 @@ struct Argument {
    * return value: how many samples are copied
    */
   int32_t resizeAndCopyFrom(const Argument& src, int32_t startSeq,
-                            int32_t copySize, bool useGpu = FLAGS_use_gpu,
-                            hl_stream_t stream = HPPL_STREAM_DEFAULT);
+                            int32_t copySize, bool useGpu, hl_stream_t stream);
+
+  int32_t resizeAndCopyFrom(const Argument& src, int32_t startSeq,
+                            int32_t copySize, bool useGpu = FLAGS_use_gpu);
 
-  void resizeAndCopyFrom(const Argument& src, bool useGpu = FLAGS_use_gpu,
-                         hl_stream_t stream = HPPL_STREAM_DEFAULT);
+  void resizeAndCopyFrom(const Argument& src, bool useGpu, hl_stream_t stream);
+
+  void resizeAndCopyFrom(const Argument& src, bool useGpu = FLAGS_use_gpu);
 
   /*
    @brief Concatenate several arguments into one and put the result into it.
......