diff --git a/paddle/gserver/layers/Layer.h b/paddle/gserver/layers/Layer.h
index 4002a3d0747a86ab7b495ffe52247521831b71b8..9813a556076bc2666869a85225feaf10f345217a 100644
--- a/paddle/gserver/layers/Layer.h
+++ b/paddle/gserver/layers/Layer.h
@@ -86,6 +86,7 @@ protected:
   /// Also used in 'use_mkldnn' case.
   std::vector<Argument> outputOtherDevice_;
   /// If there are several outputs, map them by each name.
+  /// MKLDNNLayer uses it only to merge output grads.
   std::map<std::string, Argument*> outputMap_;
   /// Used to merge grad on different devices.
   MatrixPtr tmpGrad_;
@@ -325,6 +326,11 @@ public:
     outputMap_[name] = output;
   }

+  /**
+   * Get the output map size, if the layer has multiple outputs.
+   */
+  size_t getOutputMapSize() { return outputMap_.size(); }
+
   /**
    * Get the output based on layer's name.
    */
diff --git a/paddle/gserver/layers/MKLDNNConvLayer.cpp b/paddle/gserver/layers/MKLDNNConvLayer.cpp
index 0d6742e909635c1097b4fe21bbb304f8a71af5cb..93b35e46a75313a31c1f379cde617c1d0d7ab68c 100644
--- a/paddle/gserver/layers/MKLDNNConvLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNConvLayer.cpp
@@ -225,8 +225,6 @@ void MKLDNNConvLayer::resetFwdPipeline(
     MKLDNNMatrixPtr& wgt,
     MKLDNNMatrixPtr& bias,
     MKLDNNMatrixPtr& out) {
-  pipeline.clear();
-
   if (cvtInVal_) {
     pipeline.push_back(*cvtInVal_);
   }
@@ -412,8 +410,6 @@ void MKLDNNConvLayer::resetBwdPipeline(
     MKLDNNMatrixPtr& wgt,
     MKLDNNMatrixPtr& bias,
     MKLDNNMatrixPtr& out) {
-  pipeline.clear();
-
   if (cvtOutGrad_) {
     pipeline.push_back(*cvtOutGrad_);
   }
@@ -446,28 +442,27 @@ void MKLDNNConvLayer::resetOutGrad(
     std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD,
     MKLDNNMatrixPtr& out) {
-  const MatrixPtr& outMat = output_.grad;
-  out = MKLDNNMatrix::create(outMat, wgtPD->diff_dst_primitive_desc());
-  CHECK(outVal_ != nullptr &&
-        out->getPrimitiveDesc() == outVal_->getPrimitiveDesc())
-      << "primitive desc of out grad and value should be equal";
-
-  // TODO(TJ): merge outgrad
-  // create reorder if has output grad does not match
   cpuOutGrad_ = nullptr;
   cvtOutGrad_ = nullptr;
-  if (!outputIsOnlyMKLDNN()) {
+  CHECK(outVal_ != nullptr &&
+        outVal_->getPrimitiveDesc() == wgtPD->diff_dst_primitive_desc())
+      << "primitive desc of out grad and value should be equal";
+  if (outputIsOnlyMKLDNN()) {
+    MKLDNNLayer::resetOutGrad(out, outVal_->getPrimitiveDesc());
+  } else {
     const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).grad;
-    outMat->setData(cpuOut->getData());
     // same PrimitiveDesc with cpuInVal_
     CHECK(cpuOutVal_);
     cpuOutGrad_ = MKLDNNMatrix::create(cpuOut, cpuOutVal_->getPrimitiveDesc());
-    if (cpuOutGrad_->getPrimitiveDesc() == out->getPrimitiveDesc()) {
-      out = cpuOutGrad_;
-    } else {
-      out = MKLDNNMatrix::create(nullptr, wgtPD->diff_dst_primitive_desc());
+    // create reorder if primitive desc does not match
+    if (cpuOutGrad_->getPrimitiveDesc() != outVal_->getPrimitiveDesc()) {
+      out = MKLDNNMatrix::create(output_.grad, outVal_->getPrimitiveDesc());
       cvtOutGrad_ = MKLDNNMatrix::createReorder(cpuOutGrad_, out);
       CHECK(cvtOutGrad_);
+    } else {
+      // share the same data of CPU output
+      output_.grad->setData(cpuOut->getData());
+      out = cpuOutGrad_;
     }
   }
 }
@@ -496,32 +491,30 @@ void MKLDNNConvLayer::resetWgtBiasGrad(

 void MKLDNNConvLayer::resetInGrad(
     std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
     MKLDNNMatrixPtr& in) {
+  in = nullptr;
+  cpuInGrad_ = nullptr;
+  cvtInGrad_ = nullptr;
   if (dataPD == nullptr) {
     return;
   }
-  // TODO(TJ): use outputMaps_ ways to get the inGrad_ when merge outgrad done
-  in = MKLDNNMatrix::create(inputLayers_[0]->getOutput().grad,
-                            dataPD->diff_src_primitive_desc());
-  CHECK(nullptr != inVal_ &&
-        in->getPrimitiveDesc() == inVal_->getPrimitiveDesc())
-      << "primitive desc of input grad and value should be equal";
-
-  // create reorder if has output grad does not match
-  cpuInGrad_ = nullptr;
-  cvtInGrad_ = nullptr;
-  if (!inputIsOnlyMKLDNN()) {
+  if (inputIsOnlyMKLDNN()) {
+    MKLDNNLayer::resetInGrad(in, dataPD->diff_src_primitive_desc());
+    CHECK(nullptr != inVal_ &&
+          in->getPrimitiveDesc() == inVal_->getPrimitiveDesc())
+        << "primitive desc of input grad and value should be equal";
+  } else {
     const MatrixPtr& cpuIn = getInputGrad(0, CPU_DEVICE);
     // same PrimitiveDesc with cpuInVal_
     CHECK(cpuInVal_);
     cpuInGrad_ = MKLDNNMatrix::create(cpuIn, cpuInVal_->getPrimitiveDesc());
-    if (cpuInGrad_->getPrimitiveDesc() != in->getPrimitiveDesc()) {
-      const MatrixPtr& dnnIn = getInputGrad(0, MKLDNN_DEVICE);
-      in = MKLDNNMatrix::create(dnnIn, in->getPrimitiveDesc());
+    in = cpuInGrad_;
+    // create reorder if PrimitiveDesc does not match
+    if (cpuInGrad_->getPrimitiveDesc() != dataPD->diff_src_primitive_desc()) {
+      in = MKLDNNMatrix::create(getInputGrad(0, MKLDNN_DEVICE),
+                                dataPD->diff_src_primitive_desc());
       cvtInGrad_ = MKLDNNMatrix::createReorder(in, cpuInGrad_);
      CHECK(cvtInGrad_);
-    } else {
-      in = cpuInGrad_;
     }
   }
 }
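Note on the conv changes above: both `resetOutGrad` and `resetInGrad` now follow one rule. If the plain `nchw` layout on the CPU side already matches the primitive desc MKL-DNN chose, the `MKLDNNMatrix` simply aliases the CPU buffer (`setData`); only on a mismatch is a reorder primitive appended to the pipeline. Below is a minimal standalone sketch of that decision against the MKL-DNN v0.x C++ API this code builds on; the dims and the blocked `nChw8c` format are made-up illustration values, not taken from this patch:

```cpp
#include <mkldnn.hpp>
#include <vector>

int main() {
  using namespace mkldnn;
  engine eng(engine::cpu, 0);
  memory::dims dims = {2, 8, 4, 4};  // bs, oc, oh, ow (illustrative)

  // CPU side is always plain nchw; the primitive may pick a blocked format.
  memory::primitive_desc cpuPD(
      {dims, memory::data_type::f32, memory::format::nchw}, eng);
  memory::primitive_desc dnnPD(
      {dims, memory::data_type::f32, memory::format::nChw8c}, eng);

  std::vector<float> cpuBuf(2 * 8 * 4 * 4, 1.f), dnnBuf(cpuBuf.size());
  memory cpuGrad(cpuPD, cpuBuf.data());
  memory dnnGrad(dnnPD, dnnBuf.data());

  std::vector<primitive> pipeline;
  if (cpuPD != dnnPD) {
    // layouts differ: convert the CPU grad into the MKLDNN layout,
    // the analogue of cvtOutGrad_ / cvtInGrad_ in the patch
    pipeline.push_back(reorder(cpuGrad, dnnGrad));
  }
  // else: the layer would simply alias the CPU buffer instead

  stream(stream::kind::eager).submit(pipeline).wait();
  return 0;
}
```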
diff --git a/paddle/gserver/layers/MKLDNNFcLayer.cpp b/paddle/gserver/layers/MKLDNNFcLayer.cpp
index e829456d6afd7cc844f752d4571cd9f90c73997f..11d3553abf7d5dbc2f259e382ed0b525c4747f55 100644
--- a/paddle/gserver/layers/MKLDNNFcLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNFcLayer.cpp
@@ -214,8 +214,6 @@ void MKLDNNFcLayer::resetFwdPipeline(
     MKLDNNMatrixPtr& wgt,
     MKLDNNMatrixPtr& bias,
     MKLDNNMatrixPtr& out) {
-  pipeline.clear();
-
   if (bias) {
     fwd_.reset(new fc_fwd(*pd, *in, *wgt, *bias, *out));
   } else {
@@ -237,19 +235,14 @@ void MKLDNNFcLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
 }

 void MKLDNNFcLayer::resetOutGrad(MKLDNNMatrixPtr& out) {
-  // TODO(TJ): merge outgrad
-  int device = outputIsOnlyMKLDNN() ? MKLDNN_DEVICE : CPU_DEVICE;
-  output_.grad->setData(getOutput(device).grad->getData());
-  // for MKLDNN device:
-  // can not directly cast outputgrad to mkldnnmatrix,
-  // since each layer can not write the inputgrad to mkldnn inputgrad.
-  // So just create from matrix with outputvalue format.
-  // for CPU device:
-  // fc do not need to convert from cpu device since output is always nc format
-  // only need create from cpu device
   CHECK(outVal_);
-  out =
-      MKLDNNMatrix::create(getOutput(device).grad, outVal_->getPrimitiveDesc());
+  if (outputIsOnlyMKLDNN()) {
+    MKLDNNLayer::resetOutGrad(out, outVal_->getPrimitiveDesc());
+  } else {
+    const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).grad;
+    output_.grad->setData(cpuOut->getData());
+    out = MKLDNNMatrix::create(cpuOut, outVal_->getPrimitiveDesc());
+  }
 }

 void MKLDNNFcLayer::resetWgtBiasGrad(MKLDNNMatrixPtr& wgt,
@@ -267,13 +260,11 @@ void MKLDNNFcLayer::resetWgtBiasGrad(MKLDNNMatrixPtr& wgt,

 void MKLDNNFcLayer::resetInGrad(MKLDNNMatrixPtr& in) {
   in = nullptr;
-  const MatrixPtr& inGrad = inputLayers_[0]->getOutput().grad;
-  if (inGrad == nullptr) {
+  if (inputLayers_[0]->getOutput().grad == nullptr) {
     return;
   }
-  // TODO(TJ): use outputMaps_ ways to get the inGrad_ when merge outgrad done
   CHECK(inVal_);
-  in = MKLDNNMatrix::create(inGrad, inVal_->getPrimitiveDesc());
+  MKLDNNLayer::resetInGrad(in, inVal_->getPrimitiveDesc());
 }

 void MKLDNNFcLayer::resetBwdWgtPD(
@@ -314,7 +305,6 @@ void MKLDNNFcLayer::resetBwdPipeline(
     MKLDNNMatrixPtr& wgt,
     MKLDNNMatrixPtr& bias,
     MKLDNNMatrixPtr& out) {
-  pipeline.clear();
   CHECK(inVal_);
   if (bias) {
     bwdWgt_.reset(new fc_bwdWgt(*bwdWgtPD, *inVal_, *out, *wgt, *bias));
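The long comment deleted from `MKLDNNFcLayer::resetOutGrad` made a point worth keeping: an FC output is two-dimensional (batch x size), and for 2-D data MKL-DNN keeps the plain `nc` layout on both the CPU and the MKLDNN side, so the CPU grad can always be wrapped without a reorder. A small sketch of that invariant, assuming the same MKL-DNN v0.x API (sizes are arbitrary):

```cpp
#include <mkldnn.hpp>
#include <cassert>

int main() {
  using namespace mkldnn;
  engine eng(engine::cpu, 0);
  memory::dims dims = {2, 16};  // bs, oc (illustrative)

  memory::primitive_desc cpuPD(
      {dims, memory::data_type::f32, memory::format::nc}, eng);
  memory::primitive_desc dnnPD(
      {dims, memory::data_type::f32, memory::format::nc}, eng);

  // Identical primitive descs: no reorder is ever needed, which is why
  // the FC path above can share one buffer via output_.grad->setData(...).
  assert(cpuPD == dnnPD);
  return 0;
}
```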
diff --git a/paddle/gserver/layers/MKLDNNLayer.h b/paddle/gserver/layers/MKLDNNLayer.h
index c09fd89462ef4fdaeaae3e122f96b0cc6ce373ea..41d74d08a90589761ae93cfe06bceb33bf9c94cf 100644
--- a/paddle/gserver/layers/MKLDNNLayer.h
+++ b/paddle/gserver/layers/MKLDNNLayer.h
@@ -65,6 +65,11 @@ protected:
   MKLDNNMatrixPtr biasVal_;
   MKLDNNMatrixPtr biasGrad_;

+  // merge grad primitive
+  std::shared_ptr<mkldnn::primitive> mergeGrad_;
+  // tmp input argument to save input grad, only used when merging grads
+  Argument tmpInArg_;
+
 public:
   explicit MKLDNNLayer(const LayerConfig& config)
       : Layer(config),
@@ -99,6 +104,7 @@ public:
     if (!Layer::init(layerMap, parameterMap)) {
       return false;
     }
+    setOutputMap();
     checkCPUOutputsNumber();

     stream_.reset(new MKLDNNStream());
@@ -118,6 +124,7 @@ public:
       VLOG(MKLDNN_BASE) << getName() << " reset mkldnn forward";
       // reset when input total sizes changed, not only the batchsize
       inputElemenCnt_ = elemenCnt;
+      pipelineFwd_.clear();
       reshape(bs_, ic_, ih_, iw_, oc_, oh_, ow_);
       resetFwd(pipelineFwd_, inVal_, wgtVal_, biasVal_, outVal_);
       if (outVal_) {
@@ -144,6 +151,7 @@ public:
   void backward(const UpdateCallback& callback) override {
     if (needResetBwd_) {
       VLOG(MKLDNN_BASE) << getName() << " reset mkldnn backward";
+      pipelineBwd_.clear();
       resetBwd(pipelineBwd_, inGrad_, wgtGrad_, biasGrad_, outGrad_);
       needResetBwd_ = false;
     }
@@ -247,6 +255,58 @@ protected:
     }
   }

+  /**
+   * Reset the output grad matrix from the given primitive desc,
+   * and reset the merge-grad primitive if needed.
+   * Note: when this layer has several outputs,
+   * mixing with the CPU device is not supported,
+   * because the memory desc cannot be obtained from a CPU device.
+   */
+  virtual void resetOutGrad(MKLDNNMatrixPtr& out,
+                            mkldnn::memory::primitive_desc pd) {
+    CHECK(outputIsOnlyMKLDNN()) << "do not support mixed with other device yet";
+    mergeGrad_ = nullptr;
+    out = MKLDNNMatrix::create(output_.grad, pd);
+    if (outputMap_.size() <= 1) {
+      return;
+    }
+    std::vector<double> scales;
+    std::vector<mkldnn::memory::primitive_desc> srcPDs;
+    std::vector<mkldnn::primitive::at> srcs;
+    for (auto it = outputMap_.begin(); it != outputMap_.end(); ++it) {
+      MKLDNNMatrixPtr src =
+          std::dynamic_pointer_cast<MKLDNNMatrix>(it->second->grad);
+      CHECK(src) << "should be MKLDNNMatrix";
+      auto srcDims = src->getDims();
+      auto dstDims = out->getDims();
+      CHECK_EQ(srcDims.size(), dstDims.size());
+      for (size_t i = 0; i < srcDims.size(); ++i) {
+        CHECK_EQ(srcDims[i], dstDims[i]);
+      }
+      srcPDs.push_back(src->getPrimitiveDesc());
+      srcs.push_back(*src);
+      scales.push_back(1.0);
+    }
+    auto sumPD = mkldnn::sum::primitive_desc(pd.desc(), scales, srcPDs);
+    mergeGrad_.reset(new mkldnn::sum(sumPD, srcs, *out));
+    pipelineBwd_.insert(pipelineBwd_.begin(), *mergeGrad_);
+  }
+
+  /**
+   * Reset the input grad from the given primitive desc.
+   * This function applies when the input is MKLDNN only,
+   * or when the input does not care about the CPU device.
+   */
+  virtual void resetInGrad(MKLDNNMatrixPtr& in,
+                           mkldnn::memory::primitive_desc pd) {
+    LayerPtr& input = inputLayers_[0];
+    const MatrixPtr& grad =
+        input->getOutputMapSize() > 1 ? nullptr : input->getOutput().grad;
+    in = MKLDNNMatrix::create(grad, pd);
+    Argument& arg = input->getOutput(this->getName());
+    arg.grad = std::dynamic_pointer_cast<Matrix>(in);
+  }
+
   /**
    * print info about sizes
    */
@@ -334,6 +394,16 @@ private:
     }
   }

+  /**
+   * Set output map of prev layers.
+   */
+  void setOutputMap() {
+    outputMap_.clear();
+    for (size_t i = 0; i < inputLayers_.size(); ++i) {
+      inputLayers_[i]->setOutput(getName(), &tmpInArg_);
+    }
+  }
+
   /**
    * Check the cpu device number of outputOtherDevice_.
    * should have only one at most.
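A standalone sketch of the merge-grad step that the new `MKLDNNLayer::resetOutGrad` assembles: when a layer has more than one consumer, each consumer's gradient is accumulated into one destination with a `mkldnn::sum` primitive, all scales set to 1.0. This assumes the MKL-DNN v0.x API vendored at the time, which took `double` scales (later 0.x releases switched to `float`); buffer names and sizes are illustrative:

```cpp
#include <mkldnn.hpp>
#include <cassert>
#include <vector>

int main() {
  using namespace mkldnn;
  engine eng(engine::cpu, 0);
  memory::dims dims = {2, 16};
  memory::desc md(dims, memory::data_type::f32, memory::format::nc);
  memory::primitive_desc pd(md, eng);

  std::vector<float> g0(32, 1.f), g1(32, 2.f), merged(32, 0.f);
  memory src0(pd, g0.data()), src1(pd, g1.data()), dst(pd, merged.data());

  std::vector<double> scales = {1.0, 1.0};
  std::vector<memory::primitive_desc> srcPDs = {pd, pd};
  std::vector<primitive::at> srcs = {src0, src1};

  // sum descriptor: destination desc plus per-source scales and descs,
  // mirroring the sumPD / mergeGrad_ construction in the patch
  sum::primitive_desc sumPD(md, scales, srcPDs);
  sum mergeGrad(sumPD, srcs, dst);

  std::vector<primitive> pipeline = {mergeGrad};
  stream(stream::kind::eager).submit(pipeline).wait();
  assert(merged[0] == 3.f);  // 1.0 * g0 + 1.0 * g1
  return 0;
}
```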
diff --git a/paddle/gserver/layers/MKLDNNPoolLayer.cpp b/paddle/gserver/layers/MKLDNNPoolLayer.cpp
index b62dfb7c54258a593aa50d5b30096423f375c69d..5de23e1378836bd3baee1d9c8942a9a575c9dd06 100644
--- a/paddle/gserver/layers/MKLDNNPoolLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNPoolLayer.cpp
@@ -187,7 +187,6 @@ void MKLDNNPoolLayer::resetFwdPipeline(
     std::shared_ptr<pool_fwd::primitive_desc>& pd,
     MKLDNNMatrixPtr& in,
     MKLDNNMatrixPtr& out) {
-  pipeline.clear();
   fwd_ = workspace_
              ? std::make_shared<pool_fwd>(pool_fwd(*pd, *in, *out, *workspace_))
              : std::make_shared<pool_fwd>(pool_fwd(*pd, *in, *out));
@@ -205,17 +204,17 @@ void MKLDNNPoolLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
   resetInGrad(in);
 }
 void MKLDNNPoolLayer::resetOutGrad(MKLDNNMatrixPtr& out) {
-  CHECK(outVal_) << "Should have output value";
-  out = MKLDNNMatrix::create(output_.grad, outVal_->getPrimitiveDesc());
-
-  // create reorder if output value has cpu device and pd do not match
   cpuOutGrad_ = nullptr;
   cvtOutGrad_ = nullptr;
-  if (!outputIsOnlyMKLDNN()) {
+  CHECK(outVal_);
+  if (outputIsOnlyMKLDNN()) {
+    MKLDNNLayer::resetOutGrad(out, outVal_->getPrimitiveDesc());
+  } else {
     const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).grad;
     cpuOutGrad_ = MKLDNNMatrix::create(
         cpuOut, memory::dims{bs_, oc_, oh_, ow_}, format::nchw, engine_);
-    if (cpuOutGrad_->getPrimitiveDesc() != out->getPrimitiveDesc()) {
+    if (cpuOutGrad_->getPrimitiveDesc() != outVal_->getPrimitiveDesc()) {
+      out = MKLDNNMatrix::create(output_.grad, outVal_->getPrimitiveDesc());
       cvtOutGrad_ = MKLDNNMatrix::createReorder(cpuOutGrad_, out);
       CHECK(cvtOutGrad_) << "should not be emptry";
     } else {
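In the pool hunk above, the CPU-side grad is built explicitly as `{bs_, oc_, oh_, ow_}` in `nchw` and then compared with the value's primitive desc to decide whether a reorder is needed. If you want to see which concrete layout a primitive desc actually carries, the v0.x API exposes it through the underlying C struct; a small sketch with invented dims (not values from the patch):

```cpp
#include <mkldnn.hpp>
#include <iostream>

int main() {
  using namespace mkldnn;
  engine eng(engine::cpu, 0);
  memory::dims dims = {2, 8, 6, 6};  // bs, oc, oh, ow (illustrative)

  memory::primitive_desc cpuPD(
      {dims, memory::data_type::f32, memory::format::nchw}, eng);
  memory::primitive_desc dnnPD(
      {dims, memory::data_type::f32, memory::format::nChw8c}, eng);

  // The C-level desc records the concrete layout that was chosen;
  // comparing the two primitive descs drives the cvtOutGrad_ decision.
  auto fmt = static_cast<memory::format>(dnnPD.desc().data.format);
  std::cout << "format enum: " << fmt
            << ", needs reorder: " << (cpuPD != dnnPD) << std::endl;
  return 0;
}
```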
@@ -228,12 +227,11 @@ void MKLDNNPoolLayer::resetOutGrad(MKLDNNMatrixPtr& out) {

 void MKLDNNPoolLayer::resetInGrad(MKLDNNMatrixPtr& in) {
   in = nullptr;
-  const MatrixPtr& inGrad = inputLayers_[0]->getOutput().grad;
-  if (inGrad == nullptr) {
+  if (inputLayers_[0]->getOutput().grad == nullptr) {
     return;
   }
   CHECK(inVal_);
-  in = MKLDNNMatrix::create(inGrad, inVal_->getPrimitiveDesc());
+  MKLDNNLayer::resetInGrad(in, inVal_->getPrimitiveDesc());
 }

 void MKLDNNPoolLayer::resetBwdPD(std::shared_ptr<pool_bwd::primitive_desc>& pd,
@@ -261,7 +259,6 @@ void MKLDNNPoolLayer::resetBwdPipeline(
     std::shared_ptr<pool_bwd::primitive_desc>& pd,
     MKLDNNMatrixPtr& in,
     MKLDNNMatrixPtr& out) {
-  pipeline.clear();
   if (cvtOutGrad_) {
     pipeline.push_back(*cvtOutGrad_);
   }
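Tying the pieces together: `setOutputMap()` in MKLDNNLayer.h registers this layer, via `Layer::setOutput`, in every input layer's `outputMap_`, so the new `getOutputMapSize()` in Layer.h reports how many MKLDNN consumers an output has; a count above one is what triggers the merge-grad path. A toy model of that registration with simplified stand-in types (`ToyLayer` and this `Argument` are not the Paddle classes):

```cpp
#include <cassert>
#include <map>
#include <memory>
#include <string>

// Simplified stand-ins for paddle::Argument and paddle::Layer.
struct Argument { std::shared_ptr<float> grad; };

struct ToyLayer {
  std::string name;
  std::map<std::string, Argument*> outputMap;  // mirrors Layer::outputMap_

  // what Layer::setOutput does: remember the consumer's Argument slot
  void setOutput(const std::string& consumer, Argument* arg) {
    outputMap[consumer] = arg;
  }
  size_t getOutputMapSize() const { return outputMap.size(); }
};

int main() {
  ToyLayer conv{"conv"};
  Argument branchA, branchB;  // each consumer's private tmpInArg_

  // what MKLDNNLayer::setOutputMap() does in each consumer's init()
  conv.setOutput("branch_a", &branchA);
  conv.setOutput("branch_b", &branchB);

  // two consumers: resetOutGrad would build the sum (merge) primitive
  assert(conv.getOutputMapSize() == 2);
  return 0;
}
```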