Commit 0f91ea7e authored by Haonan, committed by Yu Yang

use HPPL_STREAM_DEFAULT for layer computation

Change-Id: Id66da7b7f5bf9ec80cc19b347e4fb822a5a6f197
Parent 688eeefa
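Every hunk below applies the same idiom: queue asynchronous host/device copies on HPPL_STREAM_DEFAULT (instead of HPPL_STREAM_1), then synchronize that one stream before the copied data is read. The following is a minimal sketch of the idiom, reusing the Argument and HPPL names visible in this diff; the helper function itself and the include paths are assumptions, not part of the commit:

    #include <vector>
    #include "hl_gpu.h"                     // hl_stream_synchronize, HPPL_STREAM_DEFAULT (assumed path)
    #include "paddle/parameter/Argument.h"  // Argument (assumed path)

    // Hypothetical helper showing the copy-then-synchronize pattern this
    // commit applies in CTCLayer, HuberTwoClass, and TrainerThread.
    static void copyInputsToCpu(std::vector<paddle::Argument>& tmpCpuInput,
                                const std::vector<paddle::Argument>& inputs) {
      for (size_t i = 0; i < inputs.size(); ++i) {
        // Each copy is queued on the default stream; nothing blocks here.
        tmpCpuInput[i].resizeAndCopyFrom(inputs[i], /* useGpu = */ false,
                                         HPPL_STREAM_DEFAULT);
      }
      // Block once, after all copies are queued, before the CPU reads the data.
      hl_stream_synchronize(HPPL_STREAM_DEFAULT);
    }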
......@@ -194,8 +194,9 @@ public:
   virtual real evalImp(std::vector<Argument>& arguments) {
     CHECK_EQ(arguments.size(), (size_t)2);
     Argument output, label;
-    output.resizeAndCopyFrom(arguments[0], false);
-    label.resizeAndCopyFrom(arguments[1], false);
+    output.resizeAndCopyFrom(arguments[0], false, HPPL_STREAM_DEFAULT);
+    label.resizeAndCopyFrom(arguments[1], false, HPPL_STREAM_DEFAULT);
+    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
     CHECK(label.sequenceStartPositions);
     CHECK(label.ids);
     size_t numSequences = label.sequenceStartPositions->getSize() - 1;
......
......@@ -878,7 +878,11 @@ void TrainerThread::copyOutputGrad() {
   outArgs_.resize(outputGradArgs.size());
   for (size_t i = 0; i < outputGradArgs.size(); i++) {
     outArgs_[i].resizeAndCopyFrom(outputGradArgs[i], startSeq, copySize,
-                                  multiMachine_->useGpu());
+                                  multiMachine_->useGpu(),
+                                  HPPL_STREAM_DEFAULT);
   }
+  if (multiMachine_->useGpu()) {
+    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
+  }
   gradientMachine_->setOutputGrad(outArgs_);
 }
......
......@@ -49,9 +49,10 @@ void CTCLayer::forward(PassType passType) {
   Layer::forward(passType);
   if (useGpu_) {
     for (size_t i = 0; i < inputLayers_.size(); i++) {
-      tmpCpuInput_[i].resizeAndCopyFrom(getInput(i), false, HPPL_STREAM_1);
+      tmpCpuInput_[i].resizeAndCopyFrom(
+          getInput(i), false, HPPL_STREAM_DEFAULT);
     }
-    hl_stream_synchronize(HPPL_STREAM_1);
+    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
     forwardImp(tmpCpuInput_[0], tmpCpuInput_[1]);
   } else {
     forwardImp(getInput(0), getInput(1));
......@@ -93,9 +94,9 @@ void CTCLayer::backward(const UpdateCallback &callback) {
   if (useGpu_) {
     backwardImp(callback, tmpCpuInput_[0], tmpCpuInput_[1]);
     const_cast<Argument&>(getInput(0)).
-        resizeAndCopyFrom(tmpCpuInput_[0], true, HPPL_STREAM_1);
+        resizeAndCopyFrom(tmpCpuInput_[0], true, HPPL_STREAM_DEFAULT);
     const_cast<Argument&>(getInput(1)).
-        resizeAndCopyFrom(tmpCpuInput_[1], true, HPPL_STREAM_1);
+        resizeAndCopyFrom(tmpCpuInput_[1], true, HPPL_STREAM_DEFAULT);
   } else {
     backwardImp(callback, getInput(0), getInput(1));
   }
......
......@@ -248,7 +248,7 @@ void ConvOperator::forward() {
   CHECK_EQ(ins_[1]->value->getHeight(), batchSize);
   checkFilterSize(ins_[1]->value);
   Matrix::resizeOrCreate(out_->value, batchSize,
-                         outputH_ * outputW_ * numFilters_);
+                         outputH_ * outputW_ * numFilters_, false, useGpu_);
   {
     AsyncGpuBlock block;
     for (size_t batchId = 0; batchId < batchSize; ++batchId) {
......
......@@ -509,9 +509,10 @@ void HuberTwoClass::forwardImp(Matrix &output, Argument &label,
                                Matrix &cost) {
   if (useGpu_) {
     for (size_t i = 0; i < inputLayers_.size(); i++) {
-      tmpCpuInput_[i].resizeAndCopyFrom(getInput(i), false, HPPL_STREAM_1);
+      tmpCpuInput_[i].resizeAndCopyFrom(
+          getInput(i), false, HPPL_STREAM_DEFAULT);
     }
-    hl_stream_synchronize(HPPL_STREAM_1);
+    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
   }
   forwardImpIn(output, label, cost);
 }
......
......@@ -52,9 +52,10 @@ public:
     Layer::forward(passType);
     if (useGpu_) {
       for (size_t i = 0; i < inputLayers_.size(); i++) {
-        tmpCpuInput_[i].resizeAndCopyFrom(getInput(i), false, HPPL_STREAM_1);
+        tmpCpuInput_[i].resizeAndCopyFrom(
+            getInput(i), false, HPPL_STREAM_DEFAULT);
       }
-      hl_stream_synchronize(HPPL_STREAM_1);
+      hl_stream_synchronize(HPPL_STREAM_DEFAULT);
       forwardImp(tmpCpuInput_[0]);
     } else {
       forwardImp(getInput(0));
......
......@@ -22,8 +22,12 @@ namespace paddle {
 static void resizeAndCopy(MatrixPtr& dest, const MatrixPtr& src, bool useGpu,
                           hl_stream_t stream) {
   if (src) {
-    Matrix::resizeOrCreate(dest, src->getHeight(),
-                           src->getWidth(), false, useGpu);
+    if (!dest) {
+      dest = src->clone(0, 0, useGpu);
+    } else {
+      CHECK_EQ(dest->useGpu(), useGpu);
+      dest->resize(src->getHeight(), src->getWidth());
+    }
     dest->copyFrom(*src, stream);
   } else {
     dest.reset();
......@@ -59,7 +63,12 @@ static void resizeAndCopy(MatrixPtr& dest, const MatrixPtr& src,
   CHECK_LE((size_t)startRow + copySize, src->getHeight());
   int height = copySize;
   int width = src->getWidth();
-  Matrix::resizeOrCreate(dest, height, width, false, useGpu);
+  if (!dest) {
+    dest = src->clone(height, width, useGpu);
+  } else {
+    CHECK_EQ(dest->useGpu(), useGpu);
+    dest->resize(height, width);
+  }
   MatrixPtr submat = src->subMatrix(startRow, copySize);
   if (dynamic_cast<GpuSparseMatrix*>(dest.get())) {
     // copy a subMatrix of CpuSparseMatrix to GpuSparseMatrix.
......
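Both resizeAndCopy hunks above swap Matrix::resizeOrCreate for a clone-or-resize branch. A plausible reading, which the commit message does not state: resizeOrCreate always yields a plain dense matrix, whereas src->clone(height, width, useGpu) allocates a destination of src's concrete type, and the CHECK_EQ refuses to reuse a destination that lives on the wrong device. Sketched below as a standalone helper; ensureLike is a hypothetical name, not part of the commit:

    // Hypothetical helper mirroring the branch introduced in both hunks above.
    static void ensureLike(MatrixPtr& dest, const MatrixPtr& src,
                           size_t height, size_t width, bool useGpu) {
      if (!dest) {
        // clone() allocates a matrix of src's concrete type (dense/sparse)
        // on the requested device.
        dest = src->clone(height, width, useGpu);
      } else {
        // Reuse dest only if it is already on the right device; never
        // silently migrate it.
        CHECK_EQ(dest->useGpu(), useGpu);
        dest->resize(height, width);
      }
    }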
......@@ -203,15 +203,27 @@ struct Argument {
    * startSeq: the sample id of start
    * copySize: how many samples need to copy
    * return value: how many samples are copied
+   * Note that when the stream is specified explicitly here, the caller
+   * must also synchronize the stream at some point after this call
    */
+  int32_t resizeAndCopyFrom(const Argument& src, int32_t startSeq,
+                            int32_t copySize, bool useGpu, hl_stream_t stream);
+  /*
+   * same as the above function, except that the stream is
+   * HPPL_STREAM_DEFAULT and synchronization is called automatically
+   * inside it
+   */
   int32_t resizeAndCopyFrom(const Argument& src, int32_t startSeq,
                             int32_t copySize, bool useGpu = FLAGS_use_gpu);
+  void resizeAndCopyFrom(const Argument& src, bool useGpu, hl_stream_t stream);
+  /*
+   * same as the above function, except that the stream is
+   * HPPL_STREAM_DEFAULT and synchronization is called automatically
+   * inside it
+   */
   void resizeAndCopyFrom(const Argument& src, bool useGpu = FLAGS_use_gpu);
   /*
......
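Taken together, the declarations above put the synchronization contract into the API surface: pass a stream explicitly and the caller owns the synchronize; omit it and HPPL_STREAM_DEFAULT is used with the synchronize done inside. An illustrative call site follows, where srcArg and dstArg are hypothetical Argument objects:

    // Explicit-stream overload: the copy is asynchronous, so the caller must
    // synchronize before reading dstArg.
    dstArg.resizeAndCopyFrom(srcArg, /* useGpu = */ false, HPPL_STREAM_DEFAULT);
    hl_stream_synchronize(HPPL_STREAM_DEFAULT);

    // Convenience overload: uses HPPL_STREAM_DEFAULT and synchronizes
    // internally, so dstArg is ready as soon as the call returns.
    dstArg.resizeAndCopyFrom(srcArg, /* useGpu = */ false);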