提交 322d9ad8 编写于 作者: T Tao Luo 提交者: GitHub

Merge pull request #3712 from tensor-tang/merge

add MKLDNN_DEVICE 
...@@ -51,7 +51,7 @@ ExternalProject_Add( ...@@ -51,7 +51,7 @@ ExternalProject_Add(
${EXTERNAL_PROJECT_LOG_ARGS} ${EXTERNAL_PROJECT_LOG_ARGS}
DEPENDS ${MKLDNN_DEPENDS} DEPENDS ${MKLDNN_DEPENDS}
GIT_REPOSITORY "https://github.com/01org/mkl-dnn.git" GIT_REPOSITORY "https://github.com/01org/mkl-dnn.git"
GIT_TAG "v0.9" GIT_TAG "v0.10"
PREFIX ${MKLDNN_SOURCES_DIR} PREFIX ${MKLDNN_SOURCES_DIR}
UPDATE_COMMAND "" UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR} CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR}
......
...@@ -28,7 +28,7 @@ INCLUDE(ExternalProject) ...@@ -28,7 +28,7 @@ INCLUDE(ExternalProject)
SET(MKLML_PROJECT "extern_mklml") SET(MKLML_PROJECT "extern_mklml")
SET(MKLML_VER "mklml_lnx_2018.0.20170720") SET(MKLML_VER "mklml_lnx_2018.0.20170720")
SET(MKLML_URL "https://github.com/01org/mkl-dnn/releases/download/v0.9/${MKLML_VER}.tgz") SET(MKLML_URL "https://github.com/01org/mkl-dnn/releases/download/v0.10/${MKLML_VER}.tgz")
SET(MKLML_SOURCE_DIR "${THIRD_PARTY_PATH}/mklml") SET(MKLML_SOURCE_DIR "${THIRD_PARTY_PATH}/mklml")
SET(MKLML_DOWNLOAD_DIR "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}") SET(MKLML_DOWNLOAD_DIR "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")
SET(MKLML_DST_DIR "mklml") SET(MKLML_DST_DIR "mklml")
......
...@@ -41,7 +41,7 @@ namespace paddle { ...@@ -41,7 +41,7 @@ namespace paddle {
Layer::Layer(const LayerConfig& config, bool useGpu) Layer::Layer(const LayerConfig& config, bool useGpu)
: config_(config), : config_(config),
useGpu_(useGpu), useGpu_(useGpu),
deviceId_(-1), deviceId_(CPU_DEVICE),
needSequenceInfo_(true) {} needSequenceInfo_(true) {}
bool Layer::init(const LayerMap& layerMap, const ParameterMap& parameterMap) { bool Layer::init(const LayerMap& layerMap, const ParameterMap& parameterMap) {
......
...@@ -59,7 +59,12 @@ protected: ...@@ -59,7 +59,12 @@ protected:
LayerConfig config_; LayerConfig config_;
/// whether to use GPU /// whether to use GPU
bool useGpu_; bool useGpu_;
/// Device Id. CPU is -1, and GPU is 0, 1, 2 ... /// Paddle device ID, MKLDNN is -2, CPU is -1
// Named ids for the non-GPU devices a layer can be bound to.
// Both values are negative; deviceId_ is initialised to CPU_DEVICE by the
// Layer constructor.
enum PADDLE_DEVICE_ID {
// Layer is executed through MKLDNN.
MKLDNN_DEVICE = -2,
// Layer is executed on the plain CPU path.
CPU_DEVICE = -1,
};
/// Device Id. MKLDNN is -2, CPU is -1, and GPU is 0, 1, 2 ...
int deviceId_; int deviceId_;
/// Input layers /// Input layers
std::vector<LayerPtr> inputLayers_; std::vector<LayerPtr> inputLayers_;
...@@ -77,6 +82,7 @@ protected: ...@@ -77,6 +82,7 @@ protected:
Argument output_; Argument output_;
/// Several outputs stored on different devices, used in 'parallel_nn' case, /// Several outputs stored on different devices, used in 'parallel_nn' case,
/// and record them by deviceId_. /// and record them by deviceId_.
/// Also used in 'use_mkldnn' case.
std::vector<Argument> outputOtherDevice_; std::vector<Argument> outputOtherDevice_;
/// If there are several outputs, map them by each name. /// If there are several outputs, map them by each name.
std::map<std::string, Argument*> outputMap_; std::map<std::string, Argument*> outputMap_;
...@@ -172,6 +178,13 @@ protected: ...@@ -172,6 +178,13 @@ protected:
return inputLayer.getOutput(deviceId_); return inputLayer.getOutput(deviceId_);
} }
/**
 * Fetch the output argument that the input layer at inputIndex
 * exposes for the requested deviceId.
 */
const Argument& getInput(size_t inputIndex, int deviceId) const {
  const LayerPtr& producer = inputLayers_[inputIndex];
  return producer->getOutput(deviceId);
}
/** /**
* Get the forward-input value. * Get the forward-input value.
*/ */
...@@ -186,6 +199,13 @@ protected: ...@@ -186,6 +199,13 @@ protected:
return inputLayer.getOutput(deviceId_).value; return inputLayer.getOutput(deviceId_).value;
} }
/**
 * Fetch the forward value of the input layer at inputIndex,
 * taken from the output it exposes for the requested deviceId.
 */
const MatrixPtr& getInputValue(int inputIndex, int deviceId) {
  const Argument& producerOutput =
      inputLayers_[inputIndex]->getOutput(deviceId);
  return producerOutput.value;
}
/** /**
* Get the forward-input grad. * Get the forward-input grad.
*/ */
...@@ -200,6 +220,13 @@ protected: ...@@ -200,6 +220,13 @@ protected:
return inputLayer.getOutput(deviceId_).grad; return inputLayer.getOutput(deviceId_).grad;
} }
/**
 * Fetch the gradient of the input layer at inputIndex,
 * taken from the output it exposes for the requested deviceId.
 */
const MatrixPtr& getInputGrad(int inputIndex, int deviceId) {
  const Argument& producerOutput =
      inputLayers_[inputIndex]->getOutput(deviceId);
  return producerOutput.grad;
}
/** /**
* Get the forward-input label. * Get the forward-input label.
*/ */
......
...@@ -61,43 +61,42 @@ void MKLDNNFcLayer::convertWeightsFromPaddle() { ...@@ -61,43 +61,42 @@ void MKLDNNFcLayer::convertWeightsFromPaddle() {
return; return;
} }
// TODO(TJ): dst format should get from wgtVal_ CHECK(wgtVal_) << "should have been initialized";
int dstFmt = PARAM_FORMAT_MKLDNN_OI; bool hasNoSpatial_ = ih_ == 1 && iw_ == 1;
int srcFmt = weight_->getParameterPtr()->getHeaderFormat(); auto targetDim = wgtVal_->getDims();
if (srcFmt == dstFmt) { auto srcFmt = hasNoSpatial_ ? memory::format::io : memory::format::ihwo;
return; wgtVal_->reorderDataFrom(wgtVal_, srcFmt, targetDim);
}
// The weight_ is transposed from initial paddle weight
MatrixPtr paddleWgt = Matrix::create(
weight_->getW()->getData(), iLayerSize_, oc_, false, false);
// TODO(TJ): remove this print when do not need differ weights
std::ostringstream ostr;
paddleWgt->print(ostr);
VLOG(MKLDNN_ALL) << "Initial Weight from paddle: " << std::endl << ostr.str();
// The mkldnn weight is transposed from initial paddle matrix
MatrixPtr paddleWgtT;
paddleWgt->transpose(paddleWgtT, true);
weight_->getW()->copyFrom(*paddleWgtT);
weight_->getParameterPtr()->setHeaderFormat(dstFmt);
hasInitedWgt_ = true; hasInitedWgt_ = true;
} }
void MKLDNNFcLayer::convertWeightsToPaddle() { void MKLDNNFcLayer::convertWeightsToPaddle() {
MatrixPtr dnnWgt = weight_->getW(); CHECK(wgtVal_) << "should have been initialized";
MatrixPtr paddleWgt; bool hasNoSpatial_ = ih_ == 1 && iw_ == 1;
dnnWgt->transpose(paddleWgt, true); auto targetDim = wgtVal_->getDims();
auto dstFmt = hasNoSpatial_ ? memory::format::io : memory::format::ihwo;
// copy paddle weight and override on weight_ wgtVal_->reorderDataTo(wgtVal_, dstFmt, targetDim);
MatrixPtr dnnWgtT = Matrix::create( }
dnnWgt->getData(), dnnWgt->getWidth(), dnnWgt->getHeight(), false, false);
dnnWgtT->copyFrom(*paddleWgt); void MKLDNNFcLayer::convertOutputToOtherDevice() {
copyOutputInfoToOtherDevice();
// find other cpu device and reorder output to cpu device
int cnt = 0;
for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
if (outputOtherDevice_[i].deviceId == CPU_DEVICE) {
// fc cpu output value do not need convert
// just share point
outputOtherDevice_[i].value = output_.value;
++cnt;
}
}
if (cnt > 1) {
LOG(WARNING) << "should not have more than one CPU devie";
}
} }
void MKLDNNFcLayer::reshape() { void MKLDNNFcLayer::reshape() {
const Argument& input = getInput(0); const Argument& input = getInput(0, getPrev(0)->getDeviceId());
int batchSize = input.getBatchSize(); int batchSize = input.getBatchSize();
if (bs_ == batchSize) { if (bs_ == batchSize) {
return; return;
...@@ -111,10 +110,6 @@ void MKLDNNFcLayer::reshape() { ...@@ -111,10 +110,6 @@ void MKLDNNFcLayer::reshape() {
if (iw_ == 0) { if (iw_ == 0) {
iw_ = 1; iw_ = 1;
} }
hasSpatial_ = true;
if (ih_ == 1 && iw_ == 1) {
hasSpatial_ = false;
}
CHECK_EQ(iLayerSize_, inputLayers_[0]->getSize()); CHECK_EQ(iLayerSize_, inputLayers_[0]->getSize());
ic_ = iLayerSize_ / (ih_ * iw_); ic_ = iLayerSize_ / (ih_ * iw_);
CHECK_EQ(size_t(ic_ * ih_ * iw_), iLayerSize_) << "not divisible"; CHECK_EQ(size_t(ic_ * ih_ * iw_), iLayerSize_) << "not divisible";
...@@ -135,37 +130,53 @@ void MKLDNNFcLayer::reshape() { ...@@ -135,37 +130,53 @@ void MKLDNNFcLayer::reshape() {
void MKLDNNFcLayer::resetFwd() { void MKLDNNFcLayer::resetFwd() {
bool hasBias = biases_ && biases_->getW(); bool hasBias = biases_ && biases_->getW();
real* iData = getInputValue(0)->getData(); const MatrixPtr& wgt = weight_->getW();
real* oData = getOutputValue()->getData(); const MatrixPtr& bias = hasBias ? biases_->getW() : nullptr;
real* wData = weight_->getW()->getData(); const MatrixPtr& out = output_.value;
real* bData = hasBias ? biases_->getW()->getData() : NULL;
if (inputIsOnlyMKLDNN()) {
// TODO(TJ): below create should be covered in MkldnnMatrix const MatrixPtr& in = getInputValue(0);
// create memory desc inVal_ = std::dynamic_pointer_cast<MKLDNNMatrix>(in);
memory::desc iMD = hasSpatial_ ? createMD({bs_, ic_, ih_, iw_}, format::nchw) CHECK(inVal_) << "Input should be MKLDNNMatrix";
: createMD({bs_, ic_}, format::nc); } else {
memory::desc wMD = hasSpatial_ ? createMD({oc_, ic_, ih_, iw_}, format::oihw) CHECK_EQ(getPrev(0)->getDeviceId(), CPU_DEVICE) << "Only support CPU yet";
: createMD({oc_, ic_}, format::oi); const MatrixPtr& in = getInputValue(0, CPU_DEVICE);
memory::desc bMD = bData != NULL ? createMD({oc_}, format::x) inVal_ = MKLDNNMatrix::create(
: createMD({}, format::format_undef); in, memory::dims{bs_, ic_, ih_, iw_}, format::nchw, engine_);
memory::desc oMD = createMD({bs_, oc_}, format::nc); }
inVal_->downSpatial();
// create memory primitive desc and memory self wgtVal_ = MKLDNNMatrix::create(
inVal_.reset(new memory(memory::primitive_desc(iMD, engine_), iData)); wgt, memory::dims{oc_, ic_, ih_, iw_}, format::oihw, engine_);
wgtVal_.reset(new memory(memory::primitive_desc(wMD, engine_), wData)); wgtVal_->downSpatial();
outVal_.reset(new memory(memory::primitive_desc(oMD, engine_), oData)); biasVal_ =
hasBias ? MKLDNNMatrix::create(bias, {oc_}, format::x, engine_) : nullptr;
outVal_ = MKLDNNMatrix::create(out, {bs_, oc_}, format::nc, engine_);
// change original output value to mkldnn output value
output_.value = std::dynamic_pointer_cast<Matrix>(outVal_);
if (!outputIsOnlyMKLDNN()) {
convertOutputToOtherDevice();
}
// create forward handle
prop_kind pk = prop_kind::forward; prop_kind pk = prop_kind::forward;
fc_fwd::desc fwdDesc = bData != NULL ? fc_fwd::desc(pk, iMD, wMD, bMD, oMD) fc_fwd::desc fwdDesc = hasBias ? fc_fwd::desc(pk,
: fc_fwd::desc(pk, iMD, wMD, oMD); inVal_->getMemoryDesc(),
wgtVal_->getMemoryDesc(),
biasVal_->getMemoryDesc(),
outVal_->getMemoryDesc())
: fc_fwd::desc(pk,
inVal_->getMemoryDesc(),
wgtVal_->getMemoryDesc(),
outVal_->getMemoryDesc());
fc_fwd::primitive_desc fwdPD = fc_fwd::primitive_desc(fwdDesc, engine_); fc_fwd::primitive_desc fwdPD = fc_fwd::primitive_desc(fwdDesc, engine_);
if (hasBias) {
if (bData != NULL) {
biasVal_.reset(new memory(memory::primitive_desc(bMD, engine_), bData));
fwd_.reset(new fc_fwd(fwdPD, *inVal_, *wgtVal_, *biasVal_, *outVal_)); fwd_.reset(new fc_fwd(fwdPD, *inVal_, *wgtVal_, *biasVal_, *outVal_));
} else { } else {
fwd_.reset(new fc_fwd(fwdPD, *inVal_, *wgtVal_, *outVal_)); fwd_.reset(new fc_fwd(fwdPD, *inVal_, *wgtVal_, *outVal_));
} }
printValueFormatFlow();
pipelineFwd_.clear(); pipelineFwd_.clear();
pipelineFwd_.push_back(*fwd_); pipelineFwd_.push_back(*fwd_);
} }
...@@ -175,45 +186,46 @@ void MKLDNNFcLayer::resetBwd() { ...@@ -175,45 +186,46 @@ void MKLDNNFcLayer::resetBwd() {
return; return;
} }
needResetBwd_ = false; needResetBwd_ = false;
bool hasBias = biases_ && biases_->getWGrad(); bool hasBias = biases_ && biases_->getWGrad();
real* iData = getInputValue(0)->getData();
real* iDiff = getInputGrad(0) != nullptr ? getInputGrad(0)->getData() : NULL;
real* oDiff = getOutputGrad()->getData();
real* wDiff = weight_->getWGrad()->getData();
real* bDiff = hasBias ? biases_->getWGrad()->getData() : NULL;
/// backward weight /// backward weight
// create memory desc for backward memory CHECK(inVal_) << "Should have input value";
memory::desc iMD = hasSpatial_ ? createMD({bs_, ic_, ih_, iw_}, format::nchw) const MatrixPtr& wgt = weight_->getWGrad();
: createMD({bs_, ic_}, format::nc); const MatrixPtr& bias = hasBias ? biases_->getWGrad() : nullptr;
memory::desc wMD = hasSpatial_ ? createMD({oc_, ic_, ih_, iw_}, format::oihw)
: createMD({oc_, ic_}, format::oi); // TODO(TJ): merge outgrad
memory::desc oMD = createMD({bs_, oc_}, format::nc); int device = outputIsOnlyMKLDNN() ? MKLDNN_DEVICE : CPU_DEVICE;
memory::desc bMD = bDiff != NULL ? createMD({oc_}, format::x) // for MKLDNN device:
: createMD({}, format::format_undef); // can not directly cast outputgrad to mkldnnmatrix,
// since each layer can not write the inputgrad to mkldnn inputgrad.
if (inVal_) { // So just create from matrix with outputvalue format.
// update data // for CPU device:
inVal_->set_data_handle(iData); // fc do not need to convert from cpu device since output is always nc format
} else { // only need create from cpu device
inVal_.reset(new memory(memory::primitive_desc(iMD, engine_), iData)); const MatrixPtr& out = getOutput(device).grad;
} outGrad_ = MKLDNNMatrix::create(out, outVal_->getPrimitiveDesc());
wgtGrad_ = MKLDNNMatrix::create(wgt, wgtVal_->getPrimitiveDesc());
// create memory primitive desc and memory self biasGrad_ = hasBias ? MKLDNNMatrix::create(bias, biasVal_->getPrimitiveDesc())
wgtGrad_.reset(new memory(memory::primitive_desc(wMD, engine_), wDiff)); : nullptr;
outGrad_.reset(new memory(memory::primitive_desc(oMD, engine_), oDiff));
// create memory primitive desc
fc_fwd::desc fwdDesc = fc_fwd::desc(prop_kind::forward, iMD, wMD, oMD); fc_fwd::desc fwdDesc = fc_fwd::desc(prop_kind::forward,
inVal_->getMemoryDesc(),
wgtGrad_->getMemoryDesc(),
outGrad_->getMemoryDesc());
fc_fwd::primitive_desc fwdPD = fc_fwd::primitive_desc(fwdDesc, engine_); fc_fwd::primitive_desc fwdPD = fc_fwd::primitive_desc(fwdDesc, engine_);
fc_bwdWgt::desc bwdWgtDesc = bDiff != NULL fc_bwdWgt::desc bwdWgtDesc = hasBias
? fc_bwdWgt::desc(iMD, wMD, bMD, oMD) ? fc_bwdWgt::desc(inVal_->getMemoryDesc(),
: fc_bwdWgt::desc(iMD, wMD, oMD); wgtGrad_->getMemoryDesc(),
biasGrad_->getMemoryDesc(),
outGrad_->getMemoryDesc())
: fc_bwdWgt::desc(inVal_->getMemoryDesc(),
wgtGrad_->getMemoryDesc(),
outGrad_->getMemoryDesc());
fc_bwdWgt::primitive_desc bwdWgtPD = fc_bwdWgt::primitive_desc bwdWgtPD =
fc_bwdWgt::primitive_desc(bwdWgtDesc, engine_, fwdPD); fc_bwdWgt::primitive_desc(bwdWgtDesc, engine_, fwdPD);
if (bDiff != NULL) { if (hasBias) {
biasGrad_.reset(new memory(memory::primitive_desc(bMD, engine_), bDiff));
bwdWgt_.reset( bwdWgt_.reset(
new fc_bwdWgt(bwdWgtPD, *inVal_, *outGrad_, *wgtGrad_, *biasGrad_)); new fc_bwdWgt(bwdWgtPD, *inVal_, *outGrad_, *wgtGrad_, *biasGrad_));
} else { } else {
...@@ -223,15 +235,26 @@ void MKLDNNFcLayer::resetBwd() { ...@@ -223,15 +235,26 @@ void MKLDNNFcLayer::resetBwd() {
pipelineBwd_.push_back(*bwdWgt_); pipelineBwd_.push_back(*bwdWgt_);
/// backward data /// backward data
if (iDiff == NULL) { device = inputIsOnlyMKLDNN() ? MKLDNN_DEVICE : CPU_DEVICE;
const MatrixPtr& in = getInputGrad(0, device);
if (in == nullptr) {
return; return;
} }
fc_bwdData::desc bwdDataDesc = fc_bwdData::desc(iMD, wMD, oMD); if (getInput(0, device).getAllCount() > 1) {
// TODO(TJ): use outputMaps_ ways when merge outgrad done
} else {
inGrad_ = MKLDNNMatrix::create(in, inVal_->getPrimitiveDesc());
}
fc_bwdData::desc bwdDataDesc = fc_bwdData::desc(inVal_->getMemoryDesc(),
wgtGrad_->getMemoryDesc(),
outGrad_->getMemoryDesc());
fc_bwdData::primitive_desc bwdDataPD = fc_bwdData::primitive_desc bwdDataPD =
fc_bwdData::primitive_desc(bwdDataDesc, engine_, fwdPD); fc_bwdData::primitive_desc(bwdDataDesc, engine_, fwdPD);
inGrad_.reset(new memory(memory::primitive_desc(iMD, engine_), iDiff));
CHECK(wgtVal_) << "Should have weight memory"; CHECK(wgtVal_) << "Should have weight memory";
bwdData_.reset(new fc_bwdData(bwdDataPD, *outGrad_, *wgtVal_, *inGrad_)); bwdData_.reset(new fc_bwdData(bwdDataPD, *outGrad_, *wgtVal_, *inGrad_));
printGradFormatFlow();
pipelineBwd_.push_back(*bwdData_); pipelineBwd_.push_back(*bwdData_);
} }
...@@ -241,11 +264,7 @@ void MKLDNNFcLayer::forward(PassType passType) { ...@@ -241,11 +264,7 @@ void MKLDNNFcLayer::forward(PassType passType) {
{ {
REGISTER_TIMER_INFO("mkldnn_FwdTimer", getName().c_str()); REGISTER_TIMER_INFO("mkldnn_FwdTimer", getName().c_str());
syncInputValue();
// update input data
// since it might be changed if this is after data layer
real* iData = getInputValue(0)->getData();
inVal_->set_data_handle(iData);
// just submit forward pipeline // just submit forward pipeline
stream_->submit(pipelineFwd_); stream_->submit(pipelineFwd_);
...@@ -267,10 +286,7 @@ void MKLDNNFcLayer::backward(const UpdateCallback& callback) { ...@@ -267,10 +286,7 @@ void MKLDNNFcLayer::backward(const UpdateCallback& callback) {
REGISTER_TIMER_INFO("mkldnn_bwdTimer", getName().c_str()); REGISTER_TIMER_INFO("mkldnn_bwdTimer", getName().c_str());
resetBwd(); resetBwd();
// update diff syncOutputGrad();
real* oDiff = getOutputGrad()->getData();
outGrad_->set_data_handle(oDiff);
// just sumbmit backward pipeline // just sumbmit backward pipeline
stream_->submit(pipelineBwd_); stream_->submit(pipelineBwd_);
} }
......
...@@ -32,16 +32,13 @@ protected: ...@@ -32,16 +32,13 @@ protected:
// if has already init the weight // if has already init the weight
bool hasInitedWgt_; bool hasInitedWgt_;
// if input layer has image size info (ih>1 && iw>1)
bool hasSpatial_;
// fc weight and bias // fc weight and bias
std::unique_ptr<Weight> weight_; std::unique_ptr<Weight> weight_;
std::unique_ptr<Weight> biases_; std::unique_ptr<Weight> biases_;
public: public:
explicit MKLDNNFcLayer(const LayerConfig& config) explicit MKLDNNFcLayer(const LayerConfig& config)
: MKLDNNLayer(config), hasInitedWgt_(false), hasSpatial_(true) {} : MKLDNNLayer(config), hasInitedWgt_(false) {}
~MKLDNNFcLayer() {} ~MKLDNNFcLayer() {}
...@@ -75,6 +72,8 @@ protected: ...@@ -75,6 +72,8 @@ protected:
* only would be called when needed * only would be called when needed
*/ */
void resetBwd(); void resetBwd();
void convertOutputToOtherDevice() override;
}; };
} // namespace paddle } // namespace paddle
...@@ -18,9 +18,9 @@ limitations under the License. */ ...@@ -18,9 +18,9 @@ limitations under the License. */
#include "Layer.h" #include "Layer.h"
#include "MKLDNNBase.h" #include "MKLDNNBase.h"
#include "mkldnn.hpp" #include "mkldnn.hpp"
#include "paddle/math/MKLDNNMatrix.h"
DECLARE_bool(use_mkldnn); DECLARE_bool(use_mkldnn);
DECLARE_bool(use_mkldnn_wgt);
namespace paddle { namespace paddle {
...@@ -52,15 +52,15 @@ protected: ...@@ -52,15 +52,15 @@ protected:
std::vector<mkldnn::primitive> pipelineFwd_; std::vector<mkldnn::primitive> pipelineFwd_;
std::vector<mkldnn::primitive> pipelineBwd_; std::vector<mkldnn::primitive> pipelineBwd_;
// TODO(TJ): change below memory as MKLDNNMatrixPtr type // MKLDNNMatrixPtr
std::shared_ptr<mkldnn::memory> inVal_; MKLDNNMatrixPtr inVal_;
std::shared_ptr<mkldnn::memory> inGrad_; MKLDNNMatrixPtr inGrad_;
std::shared_ptr<mkldnn::memory> outVal_; MKLDNNMatrixPtr outVal_;
std::shared_ptr<mkldnn::memory> outGrad_; MKLDNNMatrixPtr outGrad_;
std::shared_ptr<mkldnn::memory> wgtVal_; MKLDNNMatrixPtr wgtVal_;
std::shared_ptr<mkldnn::memory> wgtGrad_; MKLDNNMatrixPtr wgtGrad_;
std::shared_ptr<mkldnn::memory> biasVal_; MKLDNNMatrixPtr biasVal_;
std::shared_ptr<mkldnn::memory> biasGrad_; MKLDNNMatrixPtr biasGrad_;
public: public:
explicit MKLDNNLayer(const LayerConfig& config) explicit MKLDNNLayer(const LayerConfig& config)
...@@ -83,17 +83,21 @@ public: ...@@ -83,17 +83,21 @@ public:
virtual bool init(const LayerMap& layerMap, virtual bool init(const LayerMap& layerMap,
const ParameterMap& parameterMap) { const ParameterMap& parameterMap) {
CHECK(FLAGS_use_mkldnn) << "MkldnnLayers only support use_mkldnn."
<< "Please set WITH_MKLDNN=ON "
<< "and set use_mkldnn=True";
CHECK(!useGpu_) << "Do not support GPU yet";
// set device id before Layer::init
setDevice(MKLDNN_DEVICE);
// change param device to MKLDNN device
setParamsDevice(MKLDNN_DEVICE, parameterMap);
if (!Layer::init(layerMap, parameterMap)) { if (!Layer::init(layerMap, parameterMap)) {
return false; return false;
} }
CHECK(FLAGS_use_mkldnn) << "MkldnnLayers only support use_mkldnn."
<< "Please set WITH_MKLDNN=ON "
<< "and set use_mkldnn=True";
stream_.reset(new MKLDNNStream()); stream_.reset(new MKLDNNStream());
engine_ = CPUEngine::Instance().getEngine(); engine_ = CPUEngine::Instance().getEngine();
// TODO(TJ): deivecId
return true; return true;
} }
...@@ -109,6 +113,12 @@ public: ...@@ -109,6 +113,12 @@ public:
*/ */
virtual void convertWeightsToPaddle() {} virtual void convertWeightsToPaddle() {}
/**
* convert MKLDNN output to other device.
* only support CPU device yet
*/
virtual void convertOutputToOtherDevice() {}
/** /**
* print info about sizes * print info about sizes
*/ */
...@@ -118,14 +128,124 @@ public: ...@@ -118,14 +128,124 @@ public:
<< ", oh: " << oh_ << ", ow: " << ow_; << ", oh: " << oh_ << ", ow: " << ow_;
} }
// TODO(TJ): move to MkldnnMatrix /**
// create memory desc * Print the mkldnn memory format flow of value
inline mkldnn::memory::desc createMD( */
mkldnn::memory::dims dims, virtual void printValueFormatFlow() {
mkldnn::memory::format fmt, if (inVal_ && outVal_) {
mkldnn::memory::data_type type = mkldnn::memory::data_type::f32) { VLOG(MKLDNN_FMTS) << "value format flow --- " << inVal_->getFormat()
// TODO(TJ): isFmtSuppoted(fmt) << " >>> " << outVal_->getFormat();
return mkldnn::memory::desc(dims, type, fmt); }
}
/**
 * Log (at MKLDNN_FMTS verbosity) which memory format the gradient has on
 * the output side and on the input side of this layer.
 * Nothing is logged unless both inGrad_ and outGrad_ are set.
 */
virtual void printGradFormatFlow() {
  const bool bothGradsReady = inGrad_ && outGrad_;
  if (!bothGradsReady) {
    return;
  }
  VLOG(MKLDNN_FMTS) << "grad format flow --- " << inGrad_->getFormat()
                    << " <<< " << outGrad_->getFormat();
}
protected:
/**
 * Mirror the frame size and the sequence descriptors of output_ into every
 * entry of outputOtherDevice_.
 * @note: only this meta information is copied here, the data value is not;
 *        that is why Layer::copyOutputToOtherDevice can not be used directly.
 */
void copyOutputInfoToOtherDevice() {
  for (auto& other : outputOtherDevice_) {
    // image size info
    other.setFrameHeight(output_.getFrameHeight());
    other.setFrameWidth(output_.getFrameWidth());
    // sequence info
    other.sequenceStartPositions = output_.sequenceStartPositions;
    other.subSequenceStartPositions = output_.subSequenceStartPositions;
    other.cpuSequenceDims = output_.cpuSequenceDims;
  }
}
/**
 * Tell whether the previous layer at the given index runs on the MKLDNN
 * device.
 * Any other previous device must be the CPU device (checked); GPU is not
 * supported yet.
 */
bool inputIsOnlyMKLDNN(int index = 0) {
  const int prevDevice = getPrev(index)->getDeviceId();
  if (prevDevice != MKLDNN_DEVICE) {
    // do not support GPU yet
    CHECK_EQ(prevDevice, CPU_DEVICE) << "Only support CPU yet";
    return false;
  }
  return true;
}
/**
 * Tell whether this layer has no output registered for another device.
 * Every registered other-device output must belong to the CPU device
 * (checked).
 */
bool outputIsOnlyMKLDNN() {
  for (const auto& other : outputOtherDevice_) {
    CHECK_EQ(other.deviceId, CPU_DEVICE)
        << "Only support other device is CPU yet";
  }
  return outputOtherDevice_.empty();
}
/**
 * Re-point inVal_ at the current CPU input buffer.
 * Does nothing when the input comes from an MKLDNN layer.
 */
void syncInputValue() {
  if (!inputIsOnlyMKLDNN()) {
    // the input buffer might have changed, e.g. when this layer follows a
    // data layer, so refresh the handle on every call
    inVal_->updateData(getInputValue(0, CPU_DEVICE)->getData());
  }
}
/**
 * Re-point outGrad_ at the gradient buffer of the CPU device output.
 * Does nothing when the output is only consumed on the MKLDNN device.
 */
void syncOutputGrad() {
  if (!outputIsOnlyMKLDNN()) {
    // refresh the diff handle from the CPU output
    outGrad_->updateData(getOutput(CPU_DEVICE).grad->getData());
  }
}
/**
 * Bind this layer to the device with the given id.
 */
void setDevice(int id) {
  deviceId_ = id;
}
/**
* Set deviceId of the params used in this layer.
*/
void setParamsDevice(int id, const ParameterMap& parameterMap) {
for (auto& inputConfig : config_.inputs()) {
if (inputConfig.has_input_parameter_name()) {
ParameterPtr parameter;
std::string name = inputConfig.input_parameter_name();
CHECK(mapGet(name, parameterMap, &parameter))
<< "Cannot find input parameter " << name << " for layer "
<< getName();
parameter->setDevice(id);
}
}
if (config_.has_bias_parameter_name()) {
ParameterPtr parameter;
std::string name = config_.bias_parameter_name();
CHECK(mapGet(name, parameterMap, &parameter))
<< "Cannot find bias parameter " << name << " for layer "
<< getName();
parameter->setDevice(id);
}
} }
}; };
......
...@@ -48,7 +48,13 @@ public: ...@@ -48,7 +48,13 @@ public:
*/ */
virtual void* alloc(size_t size) { virtual void* alloc(size_t size) {
void* ptr; void* ptr;
#ifdef PADDLE_USE_MKLDNN
// refer to https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp
// memory alignment
CHECK_EQ(posix_memalign(&ptr, 4096ul, size), 0);
#else
CHECK_EQ(posix_memalign(&ptr, 32ul, size), 0); CHECK_EQ(posix_memalign(&ptr, 32ul, size), 0);
#endif
CHECK(ptr) << "Fail to allocate CPU memory: size=" << size; CHECK(ptr) << "Fail to allocate CPU memory: size=" << size;
return ptr; return ptr;
} }
......
...@@ -14,6 +14,17 @@ ...@@ -14,6 +14,17 @@
# #
file(GLOB MATH_HEADERS . *.h) file(GLOB MATH_HEADERS . *.h)
file(GLOB MATH_SOURCES . *.cpp) file(GLOB MATH_SOURCES . *.cpp)
# MKLDNNMatrix is only built when WITH_MKLDNN is enabled.
# Otherwise drop its header and source from the lists that will be compiled.
if(NOT WITH_MKLDNN)
set(DNN_HEADER "${CMAKE_CURRENT_SOURCE_DIR}/MKLDNNMatrix.h")
set(DNN_SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/MKLDNNMatrix.cpp")
# Remove the two MKLDNNMatrix files from the collected header/source lists.
list(REMOVE_ITEM MATH_HEADERS "${DNN_HEADER}")
list(REMOVE_ITEM MATH_SOURCES "${DNN_SOURCE}")
message(STATUS "Skip compiling with MKLDNNMatrix")
else()
message(STATUS "Compile with MKLDNNMatrix")
endif()
set(MATH_SOURCES set(MATH_SOURCES
"${PADDLE_SOURCE_DIR}/paddle/math/BaseMatrix.cu" "${PADDLE_SOURCE_DIR}/paddle/math/BaseMatrix.cu"
"${PADDLE_SOURCE_DIR}/paddle/math/TrainingAlgorithmOp.cu" "${PADDLE_SOURCE_DIR}/paddle/math/TrainingAlgorithmOp.cu"
......
/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "MKLDNNMatrix.h"
using namespace mkldnn; // NOLINT
namespace paddle {
/**
 * Create an MKLDNNMatrix described by the primitive descriptor pd on top of
 * the data buffer of the matrix m.
 *
 * @param m   CPU matrix that provides data, height and width. When it is
 *            nullptr a new matrix of shape
 *            (dims[0], product(dims) / dims[0]) is created first.
 * @param pd  memory primitive descriptor; the product of its dims must equal
 *            the element count of m.
 * @return    the new MKLDNNMatrix. The returned pointer (and every copy of
 *            it) keeps m alive.
 */
MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m, memory::primitive_desc pd) {
  memory::desc md = pd.desc();
  size_t ndims = md.data.ndims;
  int* dims = md.data.dims;
  CHECK(ndims > 0) << "Input dims should not be empty";
  size_t cnts = 1;
  for (size_t i = 0; i < ndims; ++i) {
    cnts *= dims[i];
  }

  if (m == nullptr) {
    // dims[0] is used as a divisor below, so reject a non-positive leading
    // dim with a clear message instead of dividing by zero.
    CHECK_GT(dims[0], 0) << "The first dim should be positive";
    size_t height = dims[0];
    size_t width = cnts / dims[0];
    m = Matrix::create(height, width, false, false);
  }

  CHECK(m) << " Matrix should not be empty";
  CpuMatrixPtr cpuMatrix = std::dynamic_pointer_cast<CpuMatrix>(m);
  CHECK(cpuMatrix) << "Only support create from CPU matrix yet";

  CHECK_EQ(cnts, m->getElementCnt()) << "Count size does not match";

  // The MKLDNNMatrix is constructed from the raw data pointer of m, so it
  // does not keep m alive by itself. Capture m in the deleter so that the
  // source matrix lives at least as long as the returned MKLDNNMatrix;
  // otherwise the matrix created above (or a caller matrix that is released
  // afterwards) would leave the result pointing at released memory.
  MKLDNNMatrix* wrapped =
      new MKLDNNMatrix(m->getData(), m->getHeight(), m->getWidth(), pd);
  return MKLDNNMatrixPtr(wrapped, [m](MKLDNNMatrix* p) { delete p; });
}
/**
 * Convenience overload: build the memory primitive descriptor from
 * dims / data type / format on the given engine, then forward to
 * create(m, pd).
 */
MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m,
                                     memory::dims dims,
                                     memory::format fmt,
                                     engine& eg,
                                     mkldnn::memory::data_type dtype) {
  memory::desc layout(dims, dtype, fmt);
  memory::primitive_desc boundLayout(layout, eg);
  return create(m, boundLayout);
}
void MKLDNNMatrix::reorderDataFrom(const MKLDNNMatrixPtr& m,
memory::format srcFmt,
memory::dims targetDim) {
memory::format dstFmt = getFormat();
if (srcFmt == dstFmt) {
return;
}
CHECK_EQ(getElementCnt(), m->getElementCnt()) << "size should equal";
reorderOnce(getData(), m->getData(), srcFmt, dstFmt, targetDim);
}
void MKLDNNMatrix::reorderDataTo(const MKLDNNMatrixPtr& m,
memory::format dstFmt,
memory::dims targetDim) {
memory::format srcFmt = getFormat();
if (srcFmt == dstFmt) {
return;
}
CHECK_EQ(getElementCnt(), m->getElementCnt()) << "size should equal";
reorderOnce(getData(), m->getData(), srcFmt, dstFmt, targetDim);
}
void MKLDNNMatrix::reorderOnce(void* srcData,
void* dstData,
memory::format srcFmt,
memory::format dstFmt,
memory::dims dm) {
CHECK(srcData);
CHECK(dstData);
MatrixPtr tmpSrc;
if (dstData == srcData) {
// inplace data
size_t sz = 1;
for (size_t i = 0; i < dm.size(); ++i) {
sz *= dm[i];
}
tmpSrc = Matrix::create(sz, 1, false, false);
tmpSrc->copyFrom((real*)srcData, sz);
srcData = tmpSrc->getData();
}
auto dtype = this->getDtype();
auto srcMD = memory::desc(dm, dtype, srcFmt);
auto dstMD = memory::desc(dm, dtype, dstFmt);
auto eg = this->getEngine();
auto src = memory(memory::primitive_desc(srcMD, eg), srcData);
auto dst = memory(memory::primitive_desc(dstMD, eg), dstData);
auto r = reorder(src, dst);
stream(stream::kind::eager).submit({r}).wait();
}
void MKLDNNMatrix::downSpatial() {
int fmt = getFormat();
if (!(fmt == memory::format::nchw || fmt == memory::format::oihw)) {
// only support nchw and oihw yet, later can support more like nhwc, ihwo
return;
}
// TODO(TJ): change H(height) and W(width) if support nhwc or more
const int H = 2, W = 3;
memory::dims srcDims = getDims();
if (srcDims[H] != 1 || srcDims[W] != 1) {
// can not down spatial
return;
}
memory::dims dstDims = memory::dims{srcDims[0], srcDims[1]};
memory::format dstFmt;
switch (fmt) {
case memory::format::nchw:
dstFmt = memory::format::nc;
break;
case memory::format::oihw:
dstFmt = memory::format::oi;
break;
default:
LOG(FATAL) << "unsupported format";
}
memory::desc md = memory::desc(dstDims, getDtype(), dstFmt);
memory::primitive_desc pd = memory::primitive_desc(md, getEngine());
mkldnn_primitive_t result;
mkldnn::error::wrap_c_api(
mkldnn_primitive_create(&result, pd.get(), nullptr, nullptr),
"could not create a memory primitive");
reset(result);
set_data_handle(getData());
}
} // namespace paddle
/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "Matrix.h"
#include "mkldnn.hpp"
#include "paddle/parameter/Parameter.h"
namespace paddle {
class MKLDNNMatrix;
typedef std::shared_ptr<MKLDNNMatrix> MKLDNNMatrixPtr;
/**
* @brief MKLDNN Matrix.
*
*/
class MKLDNNMatrix : public CpuMatrix, public mkldnn::memory {
public:
MKLDNNMatrix(real* data,
size_t height,
size_t width,
mkldnn::memory::primitive_desc pd)
: CpuMatrix(data, height, width, false), mkldnn::memory(pd, data) {}
~MKLDNNMatrix() {}
/**
* Create MKLDNNMatrix from a MatrixPtr and memory primitive_desc
*/
static MKLDNNMatrixPtr create(MatrixPtr m, mkldnn::memory::primitive_desc pd);
/**
* Create MKLDNNMatrix from a MatrixPtr and memory details info
*/
static MKLDNNMatrixPtr create(
MatrixPtr m,
mkldnn::memory::dims dims,
mkldnn::memory::format fmt,
mkldnn::engine& eg,
mkldnn::memory::data_type dtype = mkldnn::memory::data_type::f32);
public:
/**
* Reorder this MKLDNNMatrix from other format.
* Support inplace reorder.
* @note: this function would only reorder the data layout.
* will NOT change this original dim or format info
*/
void reorderDataFrom(const MKLDNNMatrixPtr& m,
memory::format srcFmt,
memory::dims targetDim);
/**
* Reorder this MKLDNNMatrix to other format.
* Support inplace reorder.
* @note: this function would only reorder the data layout.
* will NOT change the dst dim or format info
*/
void reorderDataTo(const MKLDNNMatrixPtr& m,
memory::format dstFmt,
memory::dims targetDim);
/**
* Dimensionality reduction.
* Change format "nchw --> nc" or "oihw --> oi" if the h and w are both 1
*/
void downSpatial();
/**
* Update the memory data handle.
* Caution: This will not check the buffer size of the data,
* it should be coverd by user.
*/
void updateData(void* data) { set_data_handle(data); }
/**
* Get primitive descriptor.
*/
mkldnn::memory::primitive_desc getPrimitiveDesc() {
return this->get_primitive_desc();
}
/**
* Get memory descriptor.
*/
mkldnn::memory::desc getMemoryDesc() { return getPrimitiveDesc().desc(); }
/**
* Get dimensions.
*/
mkldnn::memory::dims getDims() {
mkldnn::memory::desc md = getMemoryDesc();
const int* src = md.data.dims;
int ndims = md.data.ndims;
mkldnn::memory::dims dst;
dst.resize(ndims);
for (int i = 0; i < ndims; ++i) {
dst[i] = src[i];
}
return dst;
}
/**
* Get format.
*/
mkldnn::memory::format getFormat() {
return (mkldnn::memory::format)(getMemoryDesc().data.format);
}
/**
* Get memory data type.
*/
mkldnn::memory::data_type getDtype() {
return (mkldnn::memory::data_type)(getMemoryDesc().data.data_type);
}
/**
* Get engine.
*/
mkldnn::engine getEngine() { return getPrimitiveDesc().get_engine(); }
protected:
/**
* Do reorder once.
* Can support inplace.
*/
void reorderOnce(void* srcData,
void* dstData,
memory::format srcFmt,
memory::format dstFmt,
memory::dims dm);
};
} // namespace paddle
...@@ -281,7 +281,11 @@ public: ...@@ -281,7 +281,11 @@ public:
/** /**
* @brief Set the format in header. * @brief Set the format in header.
*/ */
void setHeaderFormat(int32_t fmt) { headerFormat_ = fmt; } void setHeaderFormat(int32_t fmt) {
CHECK(isHeaderFormatSupported(fmt)) << "Unsupported format version: "
<< fmt;
headerFormat_ = fmt;
}
/** /**
* @brief Parameter Update Hook. * @brief Parameter Update Hook.
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册