Commit 0f91ea7e authored by Haonan, committed by Yu Yang

use HPPL_STREAM_DEFAULT for layer computation

Change-Id: Id66da7b7f5bf9ec80cc19b347e4fb822a5a6f197
Parent 688eeefa
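
In outline, the commit moves layers' GPU-to-CPU input copies from the side stream HPPL_STREAM_1 onto HPPL_STREAM_DEFAULT, and pairs every explicit-stream copy with an hl_stream_synchronize before the copied data is read. A minimal sketch of the resulting pattern, modeled on the CTCLayer hunk below (illustration only, not code from the commit):

    // Sketch: copy GPU inputs to CPU on the default stream, then
    // synchronize once before the CPU-side computation reads them.
    void forward(PassType passType) {
      Layer::forward(passType);
      if (useGpu_) {
        for (size_t i = 0; i < inputLayers_.size(); i++) {
          // asynchronous with respect to HPPL_STREAM_DEFAULT
          tmpCpuInput_[i].resizeAndCopyFrom(getInput(i), false,
                                            HPPL_STREAM_DEFAULT);
        }
        // block until every copy queued on the stream has finished
        hl_stream_synchronize(HPPL_STREAM_DEFAULT);
        forwardImp(tmpCpuInput_[0], tmpCpuInput_[1]);
      } else {
        forwardImp(getInput(0), getInput(1));
      }
    }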
@@ -194,8 +194,9 @@ public:
   virtual real evalImp(std::vector<Argument>& arguments) {
     CHECK_EQ(arguments.size(), (size_t)2);
     Argument output, label;
-    output.resizeAndCopyFrom(arguments[0], false);
-    label.resizeAndCopyFrom(arguments[1], false);
+    output.resizeAndCopyFrom(arguments[0], false, HPPL_STREAM_DEFAULT);
+    label.resizeAndCopyFrom(arguments[1], false, HPPL_STREAM_DEFAULT);
+    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
     CHECK(label.sequenceStartPositions);
     CHECK(label.ids);
     size_t numSequences = label.sequenceStartPositions->getSize() - 1;
......
@@ -878,7 +878,11 @@ void TrainerThread::copyOutputGrad() {
   outArgs_.resize(outputGradArgs.size());
   for (size_t i = 0; i < outputGradArgs.size(); i++) {
     outArgs_[i].resizeAndCopyFrom(outputGradArgs[i], startSeq, copySize,
-                                  multiMachine_->useGpu());
+                                  multiMachine_->useGpu(),
+                                  HPPL_STREAM_DEFAULT);
+  }
+  if (multiMachine_->useGpu()) {
+    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
   }
   gradientMachine_->setOutputGrad(outArgs_);
 }
......
@@ -49,9 +49,10 @@ void CTCLayer::forward(PassType passType) {
   Layer::forward(passType);
   if (useGpu_) {
     for (size_t i = 0; i < inputLayers_.size(); i++) {
-      tmpCpuInput_[i].resizeAndCopyFrom(getInput(i), false, HPPL_STREAM_1);
+      tmpCpuInput_[i].resizeAndCopyFrom(
+          getInput(i), false, HPPL_STREAM_DEFAULT);
     }
-    hl_stream_synchronize(HPPL_STREAM_1);
+    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
     forwardImp(tmpCpuInput_[0], tmpCpuInput_[1]);
   } else {
     forwardImp(getInput(0), getInput(1));
@@ -93,9 +94,9 @@ void CTCLayer::backward(const UpdateCallback &callback) {
   if (useGpu_) {
     backwardImp(callback, tmpCpuInput_[0], tmpCpuInput_[1]);
     const_cast<Argument&>(getInput(0)).
-      resizeAndCopyFrom(tmpCpuInput_[0], true, HPPL_STREAM_1);
+      resizeAndCopyFrom(tmpCpuInput_[0], true, HPPL_STREAM_DEFAULT);
     const_cast<Argument&>(getInput(1)).
-      resizeAndCopyFrom(tmpCpuInput_[1], true, HPPL_STREAM_1);
+      resizeAndCopyFrom(tmpCpuInput_[1], true, HPPL_STREAM_DEFAULT);
   } else {
     backwardImp(callback, getInput(0), getInput(1));
   }
......
@@ -248,7 +248,7 @@ void ConvOperator::forward() {
   CHECK_EQ(ins_[1]->value->getHeight(), batchSize);
   checkFilterSize(ins_[1]->value);
   Matrix::resizeOrCreate(out_->value, batchSize,
-                         outputH_ * outputW_ * numFilters_);
+                         outputH_ * outputW_ * numFilters_, false, useGpu_);
   {
     AsyncGpuBlock block;
     for (size_t batchId = 0; batchId < batchSize; ++batchId) {
......
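
A note on the ConvOperator hunk: assuming the legacy signature of Matrix::resizeOrCreate shown below (its defaults are not visible in this diff), the old call left the trailing arguments at their defaults, so out_->value was always allocated as a CPU matrix; forwarding useGpu_ keeps the output on the same device the operator runs on.

    // Assumed legacy declaration (defaults shown; an assumption, not
    // confirmed by this diff):
    static void resizeOrCreate(MatrixPtr& matrix, size_t height, size_t width,
                               bool trans = false, bool useGpu = false);
    // Old call: trans/useGpu defaulted -> out_->value always a CPU matrix.
    // New call: passes useGpu_ -> out_->value lives on the operator's device.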
@@ -509,9 +509,10 @@ void HuberTwoClass::forwardImp(Matrix &output, Argument &label,
                                Matrix &cost) {
   if (useGpu_) {
     for (size_t i = 0; i < inputLayers_.size(); i++) {
-      tmpCpuInput_[i].resizeAndCopyFrom(getInput(i), false, HPPL_STREAM_1);
+      tmpCpuInput_[i].resizeAndCopyFrom(
+          getInput(i), false, HPPL_STREAM_DEFAULT);
     }
-    hl_stream_synchronize(HPPL_STREAM_1);
+    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
   }
   forwardImpIn(output, label, cost);
 }
......
@@ -52,9 +52,10 @@ public:
   Layer::forward(passType);
   if (useGpu_) {
     for (size_t i = 0; i < inputLayers_.size(); i++) {
-      tmpCpuInput_[i].resizeAndCopyFrom(getInput(i), false, HPPL_STREAM_1);
+      tmpCpuInput_[i].resizeAndCopyFrom(
+          getInput(i), false, HPPL_STREAM_DEFAULT);
     }
-    hl_stream_synchronize(HPPL_STREAM_1);
+    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
     forwardImp(tmpCpuInput_[0]);
   } else {
     forwardImp(getInput(0));
......
@@ -22,8 +22,12 @@ namespace paddle {
 static void resizeAndCopy(MatrixPtr& dest, const MatrixPtr& src, bool useGpu,
                           hl_stream_t stream) {
   if (src) {
-    Matrix::resizeOrCreate(dest, src->getHeight(),
-                           src->getWidth(), false, useGpu);
+    if (!dest) {
+      dest = src->clone(0, 0, useGpu);
+    } else {
+      CHECK_EQ(dest->useGpu(), useGpu);
+      dest->resize(src->getHeight(), src->getWidth());
+    }
     dest->copyFrom(*src, stream);
   } else {
     dest.reset();
@@ -59,7 +63,12 @@ static void resizeAndCopy(MatrixPtr& dest, const MatrixPtr& src,
   CHECK_LE((size_t)startRow + copySize, src->getHeight());
   int height = copySize;
   int width = src->getWidth();
-  Matrix::resizeOrCreate(dest, height, width, false, useGpu);
+  if (!dest) {
+    dest = src->clone(height, width, useGpu);
+  } else {
+    CHECK_EQ(dest->useGpu(), useGpu);
+    dest->resize(height, width);
+  }
   MatrixPtr submat = src->subMatrix(startRow, copySize);
   if (dynamic_cast<GpuSparseMatrix*>(dest.get())) {
     // copy a subMatrix of CpuSparseMatrix to GpuSparseMatrix.
......
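
The helper rewrite above carries a subtle point: src->clone(...) preserves the concrete type of the source matrix, which the GpuSparseMatrix branch in the surrounding code depends on, whereas the old Matrix::resizeOrCreate path (to the best of my reading, an assumption about the legacy API) always produced a plain dense matrix. A commented restatement of the idiom, as a sketch under that assumption:

    // Sketch of the resize-or-clone idiom (assumes clone() preserves the
    // source's concrete matrix type: dense or sparse, CPU or GPU):
    if (!dest) {
      // no destination yet: allocate one of src's type on the right device
      dest = src->clone(0, 0, useGpu);
    } else {
      // an existing destination must already live on the requested device
      CHECK_EQ(dest->useGpu(), useGpu);
      dest->resize(src->getHeight(), src->getWidth());
    }
    dest->copyFrom(*src, stream);  // queued on `stream`, not yet complete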
@@ -203,15 +203,27 @@ struct Argument {
    * startSeq: the sample id of start
    * copySize: how many samples need to copy
    * return value: how many samples are copied
+   * Note that when the stream is specified explicitly, as here,
+   * synchronize must also be called somewhere after this function.
    */
   int32_t resizeAndCopyFrom(const Argument& src, int32_t startSeq,
                             int32_t copySize, bool useGpu, hl_stream_t stream);
 
+  /*
+   * Same as the function above, except that the stream is
+   * HPPL_STREAM_DEFAULT and synchronize is called automatically
+   * inside it.
+   */
   int32_t resizeAndCopyFrom(const Argument& src, int32_t startSeq,
                             int32_t copySize, bool useGpu = FLAGS_use_gpu);
 
   void resizeAndCopyFrom(const Argument& src, bool useGpu, hl_stream_t stream);
 
+  /*
+   * Same as the function above, except that the stream is
+   * HPPL_STREAM_DEFAULT and synchronize is called automatically
+   * inside it.
+   */
   void resizeAndCopyFrom(const Argument& src, bool useGpu = FLAGS_use_gpu);
 
   /*
......
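
A hedged usage sketch of the two whole-argument overloads declared above; gpuArg is a hypothetical GPU-resident Argument introduced only for illustration:

    Argument cpuArg;

    // Variant 1: explicit stream -- the caller owns synchronization.
    cpuArg.resizeAndCopyFrom(gpuArg, /* useGpu */ false, HPPL_STREAM_DEFAULT);
    // ...more copies may be queued on the same stream here...
    hl_stream_synchronize(HPPL_STREAM_DEFAULT);  // required before reading cpuArg

    // Variant 2: no stream argument -- copies via HPPL_STREAM_DEFAULT and
    // synchronizes internally, so cpuArg is ready on return.
    cpuArg.resizeAndCopyFrom(gpuArg, /* useGpu */ false);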