/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #pragma once #include #include "Layer.h" #include "MKLDNNBase.h" #include "mkldnn.hpp" #include "paddle/math/MKLDNNMatrix.h" #include "paddle/utils/Stat.h" DECLARE_bool(use_mkldnn); namespace paddle { class MKLDNNLayer; typedef std::shared_ptr MKLDNNLayerPtr; /** * @brief Base class of MKLDNNlayer. * */ class MKLDNNLayer : public Layer { protected: // input value element count size_t inputElemenCnt_; // batch size int bs_; // input image channel, height and width int ic_, ih_, iw_; // output image channel, height and width int oc_, oh_, ow_; // backward also need reset after reset forward handle bool needResetBwd_; // is output only mkldnn bool outputOnlyMKLDNN_; // mkldnn engine, stream and primivtives mkldnn::engine engine_; std::shared_ptr stream_; std::shared_ptr fwd_; std::shared_ptr bwdWgt_; std::shared_ptr bwdData_; std::vector pipelineFwd_; std::vector pipelineBwd_; // MKLDNNMatrixPtr with internal format MKLDNNMatrixPtr inVal_; MKLDNNMatrixPtr inGrad_; MKLDNNMatrixPtr outVal_; MKLDNNMatrixPtr outGrad_; MKLDNNMatrixPtr wgtVal_; MKLDNNMatrixPtr wgtGrad_; MKLDNNMatrixPtr biasVal_; MKLDNNMatrixPtr biasGrad_; // merge grad primitive std::shared_ptr mergeGrad_; std::vector pipelineMergeGrad_; // tmp input argument to save input grad, only used to merge grad Argument tmpInArg_; // since mkldnn sum do not support different formats: // can refer to https://github.com/01org/mkl-dnn/issues/134 // so need create reorder manually and save tmp MKLDNNMatrix MKLDNNMatrixPtr tmpOutGrad_; std::shared_ptr tmpCvt_; public: explicit MKLDNNLayer(const LayerConfig& config) : Layer(config), inputElemenCnt_(0), bs_(0), ic_(0), ih_(0), iw_(0), oc_(0), oh_(0), ow_(0), needResetBwd_(true), engine_(mkldnn::engine::cpu, 0), stream_(nullptr), fwd_(nullptr), bwdWgt_(nullptr), bwdData_(nullptr) {} ~MKLDNNLayer() {} virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap) { CHECK(FLAGS_use_mkldnn) << "MkldnnLayers only support use_mkldnn." << "Please set WITH_MKLDNN=ON " << "and set use_mkldnn=True"; CHECK(!useGpu_) << "Do not support GPU yet"; // set device id before Layer::init setDevice(MKLDNN_DEVICE); // change param device to MKLDNN device setParamsDevice(MKLDNN_DEVICE, parameterMap); if (!Layer::init(layerMap, parameterMap)) { return false; } setOutputMap(); checkCPUOutputsNumber(); stream_.reset(new MKLDNNStream()); engine_ = CPUEngine::Instance().getEngine(); return true; } void forward(PassType passType) override { passType_ = passType; { REGISTER_TIMER_INFO("mkldnn_FwdTimer", getName().c_str()); CHECK(!inputLayers_.empty()); copySeqInfoToOutputs(); size_t elemenCnt = inputLayers_[0]->getOutput().value->getElementCnt(); if (inputElemenCnt_ != elemenCnt) { VLOG(MKLDNN_BASE) << getName() << " reset mkldnn forward"; // reset when input total sizes changed, not only the batchsize inputElemenCnt_ = elemenCnt; pipelineFwd_.clear(); reshape(bs_, ic_, ih_, iw_, oc_, oh_, ow_); resetFwd(pipelineFwd_, inVal_, wgtVal_, biasVal_, outVal_); convertWeightsFromPaddle(); needResetBwd_ = true; } if (inputLayers_[0]->getType() == "data") { updateInputData(); } if (!outputOnlyMKLDNN_) { clearGrads(); } stream_->submit(pipelineFwd_); } /* activation */ { REGISTER_TIMER_INFO("FwActTimer", getName().c_str()); forwardActivation(); } } void backward(const UpdateCallback& callback) override { if (needResetBwd_) { VLOG(MKLDNN_BASE) << getName() << " reset mkldnn backward"; pipelineBwd_.clear(); pipelineMergeGrad_.clear(); mergeGrad_ = nullptr; resetBwd(pipelineBwd_, inGrad_, wgtGrad_, biasGrad_, outGrad_); needResetBwd_ = false; } // merge grad must before backward activation if (mergeGrad_) { REGISTER_TIMER_INFO("MergeBpGrad", getName().c_str()); stream_->submit(pipelineMergeGrad_); } { REGISTER_TIMER_INFO("BpActTimer", getName().c_str()); backwardActivation(); } { REGISTER_TIMER_INFO("mkldnn_bwdTimer", getName().c_str()); stream_->submit(pipelineBwd_); } { REGISTER_TIMER_INFO("WeightUpdate", getName().c_str()); updateWeights(callback); } } /** * reshape the input image sizes * and reset output image and buffer size * output channel can not be changed */ virtual void reshape( int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) = 0; /** * reset the mkldnn forward primitve and memory * only would be called when input size changes */ virtual void resetFwd(std::vector& pipeline, MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& wgt, MKLDNNMatrixPtr& bias, MKLDNNMatrixPtr& out) = 0; /** * reset the mkldnn backward primitve and memory for mkldnn fc * only would be called when needed */ virtual void resetBwd(std::vector& pipeline, MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& wgt, MKLDNNMatrixPtr& bias, MKLDNNMatrixPtr& out) = 0; /** * Update input value data when input layer is "data" type. * Since the input value data address might be changed. */ virtual void updateInputData() {} /** * Update weights and biases if necessary. */ virtual void updateWeights(const UpdateCallback& callback) {} /** * convert weight from paddle format to mkldnn format * weight_ will be override */ virtual void convertWeightsFromPaddle() {} /** * convert mkldnn weight to paddle format * weight_ will be override */ virtual void convertWeightsToPaddle() {} /** * add this interface as public for unit test */ void addOutputArgument(int deviceId) { Layer::addOutputArgument(deviceId); } protected: /** * reshape the input image sizes and input batchsize */ virtual void reshapeInput(int& batchsize, int& height, int& width) { const Argument& input = inputLayers_[0]->getOutput(); batchsize = input.getBatchSize(); int h = input.getFrameHeight(); int w = input.getFrameWidth(); if (h != 0) { height = h; } if (w != 0) { width = w; } } /** * reshape output image sizes */ virtual void reshapeOutput(size_t height, size_t width) { output_.setFrameHeight(height); output_.setFrameWidth(width); for (size_t i = 0; i < outputOtherDevice_.size(); i++) { outputOtherDevice_[i].setFrameHeight(height); outputOtherDevice_[i].setFrameWidth(width); } } /** * reset the output grad matrix from primitive desc. * and reset the merge grad primitive if needed. * note: when this layer has serval outputs, * it could not be mixed with cpu device, * since it can not get memory desc from cpu device. */ virtual void resetOutGrad(MKLDNNMatrixPtr& out, mkldnn::memory::primitive_desc pd) { CHECK(outputIsOnlyMKLDNN()) << "do not support mixed with other device yet"; mergeGrad_ = nullptr; pipelineMergeGrad_.clear(); out = MKLDNNMatrix::create(output_.grad, pd); if (outputMap_.size() <= 1) { return; } std::vector scales(outputMap_.size(), 1.0); std::vector srcPDs; std::vector srcs; for (auto it = outputMap_.begin(); it != outputMap_.end(); ++it) { MKLDNNMatrixPtr src = std::dynamic_pointer_cast(it->second->grad); VLOG(MKLDNN_BASE) << getName() << " has output grad " << it->first; CHECK(src) << "should be MKLDNNMatrix"; auto srcDims = src->getDims(); auto dstDims = out->getDims(); CHECK_EQ(srcDims.size(), dstDims.size()); for (size_t i = 0; i < srcDims.size(); ++i) { CHECK_EQ(srcDims[i], dstDims[i]); } srcPDs.push_back(src->getPrimitiveDesc()); srcs.push_back(*src); } // TODO(TJ): remove me when mkldnn sum support different formats for (size_t i = 1; i < srcPDs.size(); ++i) { CHECK(srcPDs[0] == srcPDs[i]); } tmpOutGrad_ = nullptr; tmpCvt_ = nullptr; if (out->getPrimitiveDesc() != srcPDs[0]) { tmpOutGrad_ = MKLDNNMatrix::create(nullptr, srcPDs[0]); tmpCvt_ = MKLDNNMatrix::createReorder(tmpOutGrad_, out); CHECK(tmpCvt_); pipelineMergeGrad_.push_back(*tmpCvt_); } else { tmpOutGrad_ = out; } auto sumPD = mkldnn::sum::primitive_desc( tmpOutGrad_->getMemoryDesc(), scales, srcPDs); mergeGrad_.reset(new mkldnn::sum(sumPD, srcs, *tmpOutGrad_)); pipelineMergeGrad_.insert(pipelineMergeGrad_.begin(), *mergeGrad_); } /** * reset input grad from primitive desc. * this function is avaiable for input is only mkldnn * or input do not care cpu device */ virtual void resetInGrad(MKLDNNMatrixPtr& in, mkldnn::memory::primitive_desc pd) { LayerPtr& input = inputLayers_[0]; const MatrixPtr& grad = input->getOutputMapSize() > 1 ? nullptr : input->getOutput().grad; in = MKLDNNMatrix::create(grad, pd); Argument& arg = input->getOutput(this->getName()); arg.grad = std::dynamic_pointer_cast(in); } /** * print info about sizes */ virtual void printSizeInfo() { VLOG(MKLDNN_SIZES) << getName() << ": bs: " << bs_ << ", ic: " << ic_ << ", ih: " << ih_ << ", iw: " << iw_ << ", oc: " << oc_ << ", oh: " << oh_ << ", ow: " << ow_; } /** * Print the mkldnn memory format flow of value */ virtual void printValueFormatFlow() { if (inVal_ && outVal_) { VLOG(MKLDNN_FMTS) << inVal_->getFormat() << " >>> " << outVal_->getFormat(); } } /** * Print the mkldnn memory format flow of grad */ virtual void printGradFormatFlow() { if (inGrad_ && outGrad_) { VLOG(MKLDNN_FMTS) << inGrad_->getFormat() << " <<< " << outGrad_->getFormat(); } } protected: /** * If input only has MKLDNN device. * Otherwise, only support the previous layer using CPU device. */ bool inputIsOnlyMKLDNN(int index = 0) { int prevDevice = getPrev(index)->getDeviceId(); if (prevDevice == MKLDNN_DEVICE) { return true; } else { // do not support GPU yet CHECK_EQ(prevDevice, CPU_DEVICE) << "Only support CPU yet"; return false; } } /** * If output only has MKLDNN device. * Otherwise, other devices should only using CPU device. */ bool outputIsOnlyMKLDNN() { for (size_t i = 0; i < outputOtherDevice_.size(); i++) { CHECK_EQ(outputOtherDevice_[i].deviceId, CPU_DEVICE) << "Only support other device is CPU yet"; } outputOnlyMKLDNN_ = outputOtherDevice_.size() == 0; return outputOnlyMKLDNN_; } /** * Set deviceId of this layer. */ void setDevice(int id) { deviceId_ = id; } private: /** * clear all grad */ void clearGrads() { output_.grad->zeroMem(); for (size_t i = 0; i < outputOtherDevice_.size(); i++) { outputOtherDevice_[i].grad->zeroMem(); } } /** * Set deviceId of the params used in this layer. */ void setParamsDevice(int id, const ParameterMap& parameterMap) { for (auto& inputConfig : config_.inputs()) { if (inputConfig.has_input_parameter_name()) { ParameterPtr parameter; std::string name = inputConfig.input_parameter_name(); CHECK(mapGet(name, parameterMap, ¶meter)) << "Cannot find input parameter " << name << " for layer " << getName(); parameter->setDevice(id); } } if (config_.has_bias_parameter_name()) { ParameterPtr parameter; std::string name = config_.bias_parameter_name(); CHECK(mapGet(name, parameterMap, ¶meter)) << "Cannot find bias parameter " << name << " for layer " << getName(); parameter->setDevice(id); } } /** * Set output map of prev layers. */ void setOutputMap() { outputMap_.clear(); for (size_t i = 0; i < inputLayers_.size(); ++i) { inputLayers_[i]->setOutput(getName(), &tmpInArg_); } } /** * Check the cpu device number of outputOtherDevice_. * should have only one at most. */ void checkCPUOutputsNumber(int max = 1) { int cnt = 0; for (size_t i = 0; i < outputOtherDevice_.size(); i++) { if (outputOtherDevice_[i].deviceId == CPU_DEVICE) { ++cnt; } } CHECK_LE(cnt, max) << "too much CPU devies"; } /** * copy SeqInfo from input layer to this output and other output devices. * @note: do not use getInput(0) since it used this deviceId_, * use "inputLayers_[0]->getOutput()" instead. */ void copySeqInfoToOutputs() { if (inputLayers_.empty() || !needSequenceInfo_) { return; } const Argument& input = inputLayers_[0]->getOutput(); output_.sequenceStartPositions = input.sequenceStartPositions; output_.subSequenceStartPositions = input.subSequenceStartPositions; output_.cpuSequenceDims = input.cpuSequenceDims; for (size_t i = 0; i < outputOtherDevice_.size(); i++) { outputOtherDevice_[i].sequenceStartPositions = output_.sequenceStartPositions; outputOtherDevice_[i].subSequenceStartPositions = output_.subSequenceStartPositions; outputOtherDevice_[i].cpuSequenceDims = output_.cpuSequenceDims; } } }; } // namespace paddle