Commit 6715beaa authored by T tensor-tang

enable merge output grad of mkldnn

Parent 6604d7cd
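
What this commit does, in short: when an MKLDNN layer's output is consumed by more than one MKLDNN layer, each consumer produces its own output gradient, and the producing layer now accumulates them with a dedicated merge primitive (`mergeGrad_`, an `mkldnn::sum`) built in `MKLDNNLayer::resetOutGrad` and run at the front of `pipelineBwd_`; the removed `TODO(TJ): merge outgrad` notes below are what this replaces. The following is a minimal standalone sketch of that merge step against the MKL-DNN v0.x C++ API that Paddle links at this point; the shapes, the `nc` format, and names such as `grad0` and `merged` are illustrative only and are not taken from the commit.

```cpp
// Minimal sketch (MKL-DNN v0.x API): accumulate two gradient buffers of
// identical shape into one destination with the sum primitive, which is
// what mergeGrad_ does for the entries of outputMap_.
#include <mkldnn.hpp>
#include <vector>

int main() {
  using namespace mkldnn;
  engine eng(engine::kind::cpu, 0);

  // two source gradients and one merged destination, all 2x3 fp32, format nc
  memory::desc md({2, 3}, memory::data_type::f32, memory::format::nc);
  memory grad0({md, eng});
  memory grad1({md, eng});
  memory merged({md, eng});

  // sum with scales 1.0: merged = grad0 + grad1
  std::vector<double> scales = {1.0, 1.0};
  std::vector<memory::primitive_desc> srcPDs = {grad0.get_primitive_desc(),
                                                grad1.get_primitive_desc()};
  auto sumPD = sum::primitive_desc(md, scales, srcPDs);

  std::vector<primitive::at> srcs = {grad0, grad1};
  sum mergeGrad(sumPD, srcs, merged);

  // run the one-primitive pipeline, as MKLDNNLayer does for pipelineBwd_
  std::vector<primitive> pipeline = {mergeGrad};
  stream(stream::kind::eager).submit(pipeline).wait();
  return 0;
}
```

In the commit itself the equivalent primitive is created once per backward reset and inserted at the beginning of `pipelineBwd_`, so the merged output gradient is ready before the layer's own backward primitives consume it.
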
@@ -86,6 +86,7 @@ protected:
   /// Also used in 'use_mkldnn' case.
   std::vector<Argument> outputOtherDevice_;
   /// If there are several outputs, map them by each name.
+  /// MKLDNNLayer uses it only to merge the output grad.
   std::map<std::string, Argument*> outputMap_;
   /// Used to merge grad on different devices.
   MatrixPtr tmpGrad_;
@@ -325,6 +326,11 @@ public:
     outputMap_[name] = output;
   }
 
+  /**
+   * Get the output map size, if the layer has multiple outputs.
+   */
+  size_t getOutputMapSize() { return outputMap_.size(); }
+
   /**
    * Get the output based on layer's name.
    */
......
@@ -225,8 +225,6 @@ void MKLDNNConvLayer::resetFwdPipeline(
     MKLDNNMatrixPtr& wgt,
     MKLDNNMatrixPtr& bias,
     MKLDNNMatrixPtr& out) {
-  pipeline.clear();
-
   if (cvtInVal_) {
     pipeline.push_back(*cvtInVal_);
   }
@@ -412,8 +410,6 @@ void MKLDNNConvLayer::resetBwdPipeline(
     MKLDNNMatrixPtr& wgt,
     MKLDNNMatrixPtr& bias,
     MKLDNNMatrixPtr& out) {
-  pipeline.clear();
-
   if (cvtOutGrad_) {
     pipeline.push_back(*cvtOutGrad_);
   }
@@ -446,28 +442,27 @@ void MKLDNNConvLayer::resetBwdPipeline(
 void MKLDNNConvLayer::resetOutGrad(
     std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD, MKLDNNMatrixPtr& out) {
-  const MatrixPtr& outMat = output_.grad;
-  out = MKLDNNMatrix::create(outMat, wgtPD->diff_dst_primitive_desc());
-  CHECK(outVal_ != nullptr &&
-        out->getPrimitiveDesc() == outVal_->getPrimitiveDesc())
-      << "primitive desc of out grad and value should be equal";
-  // TODO(TJ): merge outgrad
-  // create reorder if has output grad does not match
   cpuOutGrad_ = nullptr;
   cvtOutGrad_ = nullptr;
-  if (!outputIsOnlyMKLDNN()) {
+  CHECK(outVal_ != nullptr &&
+        outVal_->getPrimitiveDesc() == wgtPD->diff_dst_primitive_desc())
+      << "primitive desc of out grad and value should be equal";
+  if (outputIsOnlyMKLDNN()) {
+    MKLDNNLayer::resetOutGrad(out, outVal_->getPrimitiveDesc());
+  } else {
     const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).grad;
-    outMat->setData(cpuOut->getData());
     // same PrimitiveDesc with cpuInVal_
     CHECK(cpuOutVal_);
     cpuOutGrad_ = MKLDNNMatrix::create(cpuOut, cpuOutVal_->getPrimitiveDesc());
-    if (cpuOutGrad_->getPrimitiveDesc() == out->getPrimitiveDesc()) {
-      out = cpuOutGrad_;
-    } else {
-      out = MKLDNNMatrix::create(nullptr, wgtPD->diff_dst_primitive_desc());
+    // create reorder if primitive desc does not match
+    if (cpuOutGrad_->getPrimitiveDesc() != outVal_->getPrimitiveDesc()) {
+      out = MKLDNNMatrix::create(output_.grad, outVal_->getPrimitiveDesc());
       cvtOutGrad_ = MKLDNNMatrix::createReorder(cpuOutGrad_, out);
       CHECK(cvtOutGrad_);
+    } else {
+      // share the same data of CPU output
+      output_.grad->setData(cpuOut->getData());
+      out = cpuOutGrad_;
     }
   }
 }
@@ -496,32 +491,30 @@ void MKLDNNConvLayer::resetWgtBiasGrad(
 void MKLDNNConvLayer::resetInGrad(
     std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
     MKLDNNMatrixPtr& in) {
+  in = nullptr;
+  cpuInGrad_ = nullptr;
+  cvtInGrad_ = nullptr;
   if (dataPD == nullptr) {
     return;
   }
-  // TODO(TJ): use outputMaps_ ways to get the inGrad_ when merge outgrad done
-  in = MKLDNNMatrix::create(inputLayers_[0]->getOutput().grad,
-                            dataPD->diff_src_primitive_desc());
-  CHECK(nullptr != inVal_ &&
-        in->getPrimitiveDesc() == inVal_->getPrimitiveDesc())
-      << "primitive desc of input grad and value should be equal";
-  // create reorder if has output grad does not match
-  cpuInGrad_ = nullptr;
-  cvtInGrad_ = nullptr;
-  if (!inputIsOnlyMKLDNN()) {
+  if (inputIsOnlyMKLDNN()) {
+    MKLDNNLayer::resetInGrad(in, dataPD->diff_src_primitive_desc());
+    CHECK(nullptr != inVal_ &&
+          in->getPrimitiveDesc() == inVal_->getPrimitiveDesc())
+        << "primitive desc of input grad and value should be equal";
+  } else {
     const MatrixPtr& cpuIn = getInputGrad(0, CPU_DEVICE);
     // same PrimitiveDesc with cpuInVal_
     CHECK(cpuInVal_);
     cpuInGrad_ = MKLDNNMatrix::create(cpuIn, cpuInVal_->getPrimitiveDesc());
-    if (cpuInGrad_->getPrimitiveDesc() != in->getPrimitiveDesc()) {
-      const MatrixPtr& dnnIn = getInputGrad(0, MKLDNN_DEVICE);
-      in = MKLDNNMatrix::create(dnnIn, in->getPrimitiveDesc());
+    in = cpuInGrad_;
+    // create reorder if PrimitiveDesc does not match
+    if (cpuInGrad_->getPrimitiveDesc() != dataPD->diff_src_primitive_desc()) {
+      in = MKLDNNMatrix::create(getInputGrad(0, MKLDNN_DEVICE),
+                                dataPD->diff_src_primitive_desc());
       cvtInGrad_ = MKLDNNMatrix::createReorder(in, cpuInGrad_);
       CHECK(cvtInGrad_);
-    } else {
-      in = cpuInGrad_;
     }
   }
 }
......
@@ -214,8 +214,6 @@ void MKLDNNFcLayer::resetFwdPipeline(
     MKLDNNMatrixPtr& wgt,
     MKLDNNMatrixPtr& bias,
     MKLDNNMatrixPtr& out) {
-  pipeline.clear();
-
   if (bias) {
     fwd_.reset(new fc_fwd(*pd, *in, *wgt, *bias, *out));
   } else {
@@ -237,19 +235,14 @@ void MKLDNNFcLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
 }
 
 void MKLDNNFcLayer::resetOutGrad(MKLDNNMatrixPtr& out) {
-  // TODO(TJ): merge outgrad
-  int device = outputIsOnlyMKLDNN() ? MKLDNN_DEVICE : CPU_DEVICE;
-  output_.grad->setData(getOutput(device).grad->getData());
-  // for MKLDNN device:
-  // can not directly cast outputgrad to mkldnnmatrix,
-  // since each layer can not write the inputgrad to mkldnn inputgrad.
-  // So just create from matrix with outputvalue format.
-  // for CPU device:
-  // fc do not need to convert from cpu device since output is always nc format
-  // only need create from cpu device
   CHECK(outVal_);
-  out =
-      MKLDNNMatrix::create(getOutput(device).grad, outVal_->getPrimitiveDesc());
+  if (outputIsOnlyMKLDNN()) {
+    MKLDNNLayer::resetOutGrad(out, outVal_->getPrimitiveDesc());
+  } else {
+    const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).grad;
+    output_.grad->setData(cpuOut->getData());
+    out = MKLDNNMatrix::create(cpuOut, outVal_->getPrimitiveDesc());
+  }
 }
 
 void MKLDNNFcLayer::resetWgtBiasGrad(MKLDNNMatrixPtr& wgt,
@@ -267,13 +260,11 @@ void MKLDNNFcLayer::resetWgtBiasGrad(MKLDNNMatrixPtr& wgt,
 void MKLDNNFcLayer::resetInGrad(MKLDNNMatrixPtr& in) {
   in = nullptr;
-  const MatrixPtr& inGrad = inputLayers_[0]->getOutput().grad;
-  if (inGrad == nullptr) {
+  if (inputLayers_[0]->getOutput().grad == nullptr) {
     return;
   }
-  // TODO(TJ): use outputMaps_ ways to get the inGrad_ when merge outgrad done
   CHECK(inVal_);
-  in = MKLDNNMatrix::create(inGrad, inVal_->getPrimitiveDesc());
+  MKLDNNLayer::resetInGrad(in, inVal_->getPrimitiveDesc());
 }
 
 void MKLDNNFcLayer::resetBwdWgtPD(
@@ -314,7 +305,6 @@ void MKLDNNFcLayer::resetBwdPipeline(
     MKLDNNMatrixPtr& wgt,
     MKLDNNMatrixPtr& bias,
     MKLDNNMatrixPtr& out) {
-  pipeline.clear();
   CHECK(inVal_);
   if (bias) {
     bwdWgt_.reset(new fc_bwdWgt(*bwdWgtPD, *inVal_, *out, *wgt, *bias));
......
@@ -65,6 +65,11 @@ protected:
   MKLDNNMatrixPtr biasVal_;
   MKLDNNMatrixPtr biasGrad_;
 
+  // merge grad primitive
+  std::shared_ptr<mkldnn::primitive> mergeGrad_;
+  // tmp input argument to save input grad, only used to merge grad
+  Argument tmpInArg_;
+
 public:
   explicit MKLDNNLayer(const LayerConfig& config)
       : Layer(config),
@@ -99,6 +104,7 @@ public:
     if (!Layer::init(layerMap, parameterMap)) {
       return false;
     }
+    setOutputMap();
     checkCPUOutputsNumber();
 
     stream_.reset(new MKLDNNStream());
@@ -118,6 +124,7 @@ public:
       VLOG(MKLDNN_BASE) << getName() << " reset mkldnn forward";
       // reset when input total sizes changed, not only the batchsize
       inputElemenCnt_ = elemenCnt;
+      pipelineFwd_.clear();
       reshape(bs_, ic_, ih_, iw_, oc_, oh_, ow_);
       resetFwd(pipelineFwd_, inVal_, wgtVal_, biasVal_, outVal_);
       if (outVal_) {
@@ -144,6 +151,7 @@ public:
   void backward(const UpdateCallback& callback) override {
     if (needResetBwd_) {
       VLOG(MKLDNN_BASE) << getName() << " reset mkldnn backward";
+      pipelineBwd_.clear();
       resetBwd(pipelineBwd_, inGrad_, wgtGrad_, biasGrad_, outGrad_);
       needResetBwd_ = false;
     }
@@ -247,6 +255,58 @@ protected:
     }
   }
 
+  /**
+   * Reset the output grad matrix from the primitive desc,
+   * and reset the merge grad primitive if needed.
+   * Note: when this layer has several outputs, mixing them with a cpu device
+   * is not supported, because the memory desc cannot be obtained from a cpu
+   * device.
+   */
+  virtual void resetOutGrad(MKLDNNMatrixPtr& out,
+                            mkldnn::memory::primitive_desc pd) {
+    CHECK(outputIsOnlyMKLDNN()) << "do not support mixed with other device yet";
+    mergeGrad_ = nullptr;
+    out = MKLDNNMatrix::create(output_.grad, pd);
+    if (outputMap_.size() <= 1) {
+      return;
+    }
+    std::vector<double> scales;
+    std::vector<mkldnn::memory::primitive_desc> srcPDs;
+    std::vector<mkldnn::primitive::at> srcs;
+    for (auto it = outputMap_.begin(); it != outputMap_.end(); ++it) {
+      MKLDNNMatrixPtr src =
+          std::dynamic_pointer_cast<MKLDNNMatrix>(it->second->grad);
+      CHECK(src) << "should be MKLDNNMatrix";
+      auto srcDims = src->getDims();
+      auto dstDims = out->getDims();
+      CHECK_EQ(srcDims.size(), dstDims.size());
+      for (size_t i = 0; i < srcDims.size(); ++i) {
+        CHECK_EQ(srcDims[i], dstDims[i]);
+      }
+      srcPDs.push_back(src->getPrimitiveDesc());
+      srcs.push_back(*src);
+      scales.push_back(1.0);
+    }
+    auto sumPD = mkldnn::sum::primitive_desc(pd.desc(), scales, srcPDs);
+    mergeGrad_.reset(new mkldnn::sum(sumPD, srcs, *out));
+    pipelineBwd_.insert(pipelineBwd_.begin(), *mergeGrad_);
+  }
+
+  /**
+   * Reset the input grad from the primitive desc.
+   * This function is available when the input is only mkldnn,
+   * or when the input does not care about the cpu device.
+   */
+  virtual void resetInGrad(MKLDNNMatrixPtr& in,
+                           mkldnn::memory::primitive_desc pd) {
+    LayerPtr& input = inputLayers_[0];
+    const MatrixPtr& grad =
+        input->getOutputMapSize() > 1 ? nullptr : input->getOutput().grad;
+    in = MKLDNNMatrix::create(grad, pd);
+    auto arg = input->getOutput(this->getName());
+    arg.grad = std::dynamic_pointer_cast<Matrix>(in);
+  }
+
   /**
    * print info about sizes
    */
@@ -334,6 +394,16 @@ private:
     }
   }
 
+  /**
+   * Set output map of prev layers.
+   */
+  void setOutputMap() {
+    outputMap_.clear();
+    for (size_t i = 0; i < inputLayers_.size(); ++i) {
+      inputLayers_[i]->setOutput(getName(), &tmpInArg_);
+    }
+  }
+
   /**
    * Check the cpu device number of outputOtherDevice_.
    * should have only one at most.
......
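
How the new bookkeeping in MKLDNNLayer.h fits together: `setOutputMap()` registers this layer, keyed by its name, in each input layer's `outputMap_` (via `setOutput(getName(), &tmpInArg_)`); the consumer's `resetInGrad` then stores its MKLDNN grad matrix back into that per-consumer `Argument`; and a producer whose `outputMap_` holds more than one entry builds the `mkldnn::sum` merge in its own `resetOutGrad`. Below is a simplified model of that registration-and-merge flow using toy types; it is not the Paddle `Layer`/`Argument` API, just the shape of the mechanism.

```cpp
// Toy model (not the Paddle classes): consumers register a per-consumer
// Argument with their input layer; a producer with several registered
// consumers sums the registered grads, mirroring mergeGrad_.
#include <iostream>
#include <map>
#include <string>
#include <vector>

struct Argument {
  std::vector<float> grad;  // stands in for the Argument::grad matrix
};

struct ToyLayer {
  std::string name;
  std::map<std::string, Argument*> outputMap_;  // like Layer::outputMap_

  // like Layer::setOutput(name, output), called from setOutputMap()
  void setOutput(const std::string& consumer, Argument* arg) {
    outputMap_[consumer] = arg;
  }
  size_t getOutputMapSize() const { return outputMap_.size(); }

  // like MKLDNNLayer::resetOutGrad when there are several consumers
  std::vector<float> mergedOutGrad(size_t len) const {
    std::vector<float> out(len, 0.f);
    for (const auto& kv : outputMap_) {
      for (size_t i = 0; i < len; ++i) out[i] += kv.second->grad[i];
    }
    return out;
  }
};

int main() {
  ToyLayer fc{"fc1"};
  Argument branchA{{1.f, 2.f, 3.f}}, branchB{{10.f, 20.f, 30.f}};
  fc.setOutput("consumer_a", &branchA);  // what setOutputMap() does per input
  fc.setOutput("consumer_b", &branchB);
  if (fc.getOutputMapSize() > 1) {
    for (float v : fc.mergedOutGrad(3)) std::cout << v << " ";  // 11 22 33
  }
  std::cout << "\n";
  return 0;
}
```

Note also that in the real `resetInGrad`, when the producer has more than one consumer (`getOutputMapSize() > 1`), the consumer-side grad matrix is created from a null `MatrixPtr`, i.e. it gets its own buffer instead of sharing the producer's `output_.grad`, so the sum has distinct sources to accumulate.
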
@@ -187,7 +187,6 @@ void MKLDNNPoolLayer::resetFwdPipeline(
     std::shared_ptr<pool_fwd::primitive_desc>& pd,
     MKLDNNMatrixPtr& in,
     MKLDNNMatrixPtr& out) {
-  pipeline.clear();
   fwd_ = workspace_
              ? std::make_shared<pool_fwd>(pool_fwd(*pd, *in, *out, *workspace_))
              : std::make_shared<pool_fwd>(pool_fwd(*pd, *in, *out));
@@ -205,17 +204,17 @@ void MKLDNNPoolLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
   resetInGrad(in);
 }
 
 void MKLDNNPoolLayer::resetOutGrad(MKLDNNMatrixPtr& out) {
-  CHECK(outVal_) << "Should have output value";
-  out = MKLDNNMatrix::create(output_.grad, outVal_->getPrimitiveDesc());
-  // create reorder if output value has cpu device and pd do not match
   cpuOutGrad_ = nullptr;
   cvtOutGrad_ = nullptr;
-  if (!outputIsOnlyMKLDNN()) {
+  CHECK(outVal_);
+  if (outputIsOnlyMKLDNN()) {
+    MKLDNNLayer::resetOutGrad(out, outVal_->getPrimitiveDesc());
+  } else {
     const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).grad;
     cpuOutGrad_ = MKLDNNMatrix::create(
         cpuOut, memory::dims{bs_, oc_, oh_, ow_}, format::nchw, engine_);
-    if (cpuOutGrad_->getPrimitiveDesc() != out->getPrimitiveDesc()) {
+    if (cpuOutGrad_->getPrimitiveDesc() != outVal_->getPrimitiveDesc()) {
+      out = MKLDNNMatrix::create(output_.grad, outVal_->getPrimitiveDesc());
       cvtOutGrad_ = MKLDNNMatrix::createReorder(cpuOutGrad_, out);
       CHECK(cvtOutGrad_) << "should not be empty";
     } else {
@@ -228,12 +227,11 @@ void MKLDNNPoolLayer::resetOutGrad(MKLDNNMatrixPtr& out) {
 void MKLDNNPoolLayer::resetInGrad(MKLDNNMatrixPtr& in) {
   in = nullptr;
-  const MatrixPtr& inGrad = inputLayers_[0]->getOutput().grad;
-  if (inGrad == nullptr) {
+  if (inputLayers_[0]->getOutput().grad == nullptr) {
     return;
   }
   CHECK(inVal_);
-  in = MKLDNNMatrix::create(inGrad, inVal_->getPrimitiveDesc());
+  MKLDNNLayer::resetInGrad(in, inVal_->getPrimitiveDesc());
 }
 
 void MKLDNNPoolLayer::resetBwdPD(std::shared_ptr<pool_bwd::primitive_desc>& pd,
@@ -261,7 +259,6 @@ void MKLDNNPoolLayer::resetBwdPipeline(
     std::shared_ptr<pool_bwd::primitive_desc>& pd,
     MKLDNNMatrixPtr& in,
     MKLDNNMatrixPtr& out) {
-  pipeline.clear();
   if (cvtOutGrad_) {
     pipeline.push_back(*cvtOutGrad_);
   }
......
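
For the mixed-device branches (the `else` paths in the conv and pool hunks above), the conversion handled by `cvtOutGrad_` and `cvtInGrad_` is an ordinary MKL-DNN reorder between the CPU layer's plain nchw gradient and whatever format the MKLDNN backward primitive prefers. A minimal sketch of that conversion follows, again against the v0.x API; the shapes and the `nChw8c` target format are made up for illustration.

```cpp
// Minimal sketch (MKL-DNN v0.x API): reorder a CPU-side nchw gradient into a
// blocked layout, the role played by cvtOutGrad_ in the conv/pool layers.
#include <mkldnn.hpp>
#include <vector>

int main() {
  using namespace mkldnn;
  engine eng(engine::kind::cpu, 0);
  memory::dims shape = {2, 8, 4, 4};  // bs, oc, oh, ow

  // gradient as the CPU layer sees it: plain nchw
  memory::desc cpuMD(shape, memory::data_type::f32, memory::format::nchw);
  memory cpuOutGrad({cpuMD, eng});

  // gradient in a blocked layout an mkldnn primitive might expect
  memory::desc dnnMD(shape, memory::data_type::f32, memory::format::nChw8c);
  memory outGrad({dnnMD, eng});

  // the cvtOutGrad_ step: reorder cpu grad into the mkldnn-format grad
  reorder cvtOutGrad(cpuOutGrad, outGrad);

  std::vector<primitive> pipeline = {cvtOutGrad};
  stream(stream::kind::eager).submit(pipeline).wait();
  return 0;
}
```
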