diff --git a/paddle/gserver/layers/Layer.h b/paddle/gserver/layers/Layer.h
index ec4d093e0cac9a766d17a36827affd2c08e1c618..edef36194aabdb9c122ec3423deb036169a34d7c 100644
--- a/paddle/gserver/layers/Layer.h
+++ b/paddle/gserver/layers/Layer.h
@@ -82,6 +82,7 @@ protected:
   Argument output_;
   /// Several outputs stored on different devices, used in 'parallel_nn' case,
   /// and record them by deviceId_.
+  /// Also used in 'use_mkldnn' case.
   std::vector<Argument> outputOtherDevice_;
   /// If there are several outputs, map them by each name.
   std::map<std::string, ArgumentPtr> outputMap_;
@@ -177,6 +178,13 @@ protected:
     return inputLayer.getOutput(deviceId_);
   }
 
+  /**
+   * Get the argument of input layer with deviceId.
+   */
+  const Argument& getInput(size_t inputIndex, int deviceId) const {
+    return inputLayers_[inputIndex]->getOutput(deviceId);
+  }
+
   /**
    * Get the forward-input value.
    */
@@ -191,6 +199,13 @@ protected:
     return inputLayer.getOutput(deviceId_).value;
   }
 
+  /**
+   * Get the forward-input value with deviceId.
+   */
+  const MatrixPtr& getInputValue(int inputIndex, int deviceId) {
+    return inputLayers_[inputIndex]->getOutput(deviceId).value;
+  }
+
   /**
    * Get the forward-input grad.
    */
@@ -205,6 +220,13 @@ protected:
     return inputLayer.getOutput(deviceId_).grad;
   }
 
+  /**
+   * Get the forward-input grad with deviceId.
+   */
+  const MatrixPtr& getInputGrad(int inputIndex, int deviceId) {
+    return inputLayers_[inputIndex]->getOutput(deviceId).grad;
+  }
+
   /**
    * Get the forward-input label.
    */
@@ -326,19 +348,6 @@ public:
     if (deviceId == getDeviceId()) {
       return output_;
     } else {
-      bool CPU2MKLDNN =
-          getDeviceId() == CPU_DEVICE && deviceId == MKLDNN_DEVICE;
-      bool MKLDNN2CPU =
-          getDeviceId() == MKLDNN_DEVICE && deviceId == CPU_DEVICE;
-      if (CPU2MKLDNN) {
-        // TODO: do something
-        return output_;
-      } else if (MKLDNN2CPU) {
-        // TODO: do something
-        return output_;
-      }
-
-      // TODO: handle mkldnn device or add mkldnn device to other
       for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
         if (outputOtherDevice_[i].deviceId == deviceId) {
           return outputOtherDevice_[i];
diff --git a/paddle/gserver/layers/MKLDNNFcLayer.cpp b/paddle/gserver/layers/MKLDNNFcLayer.cpp
index 54631044696329517acae66484285af81a87c708..a3291e6a8fb755286427942e340fbd40c73350ad 100644
--- a/paddle/gserver/layers/MKLDNNFcLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNFcLayer.cpp
@@ -97,7 +97,7 @@ void MKLDNNFcLayer::convertWeightsToPaddle() {
 }
 
 void MKLDNNFcLayer::reshape() {
-  const Argument& input = getInput(0);
+  const Argument& input = getInput(0, getPrev(0)->getDeviceId());
   int batchSize = input.getBatchSize();
   if (bs_ == batchSize) {
     return;
@@ -135,35 +135,43 @@ void MKLDNNFcLayer::reshape() {
 
 void MKLDNNFcLayer::resetFwd() {
   bool hasBias = biases_ && biases_->getW();
-  const MatrixPtr& in = getInputValue(0);
   const MatrixPtr& wgt = weight_->getW();
   const MatrixPtr& bias = hasBias ? biases_->getW() : nullptr;
   const MatrixPtr& out = output_.value;
 
-  if (getPrev(0)->getDeviceId() == MKLDNN_DEVICE) {
+  if (prevIsMKLDNN()) {
+    const MatrixPtr& in = getInputValue(0);
     inVal_ = std::dynamic_pointer_cast<MKLDNNMatrix>(in);
     CHECK(inVal_) << "Input should be MKLDNNMatrix";
-    // TODO: change input nchw to nc if available
-    // inVal_->downSpatial()
   } else {
+    CHECK_EQ(getPrev(0)->getDeviceId(), CPU_DEVICE) << "Only support CPU yet";
+    const MatrixPtr& in = getInputValue(0, CPU_DEVICE);
     inVal_ = MKLDNNMatrix::create(
-        in,
-        hasSpatial_ ? memory::dims{bs_, ic_, ih_, iw_} : memory::dims{bs_, ic_},
-        hasSpatial_ ? format::nchw : format::nc,
-        engine_);
+        in, memory::dims{bs_, ic_, ih_, iw_}, format::nchw, engine_);
   }
-
+  inVal_->downSpatial();
   wgtVal_ = MKLDNNMatrix::create(
-      wgt,
-      hasSpatial_ ? memory::dims{oc_, ic_, ih_, iw_} : memory::dims{oc_, ic_},
-      hasSpatial_ ? format::oihw : format::oi,
-      engine_);
+      wgt, memory::dims{oc_, ic_, ih_, iw_}, format::oihw, engine_);
+  wgtVal_->downSpatial();
   biasVal_ =
       hasBias ? MKLDNNMatrix::create(bias, {oc_}, format::x, engine_) : nullptr;
   outVal_ = MKLDNNMatrix::create(out, {bs_, oc_}, format::nc, engine_);
 
-  // change original output to mkldnn output
+  // change original output value to mkldnn output value
   output_.value = std::dynamic_pointer_cast<Matrix>(outVal_);
+  if (!nextIsMKLDNN()) {
+    // update the stored CPU device output in place (pointer, not a copy)
+    Argument* cpuOutput = nullptr;
+    for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
+      if (outputOtherDevice_[i].deviceId == CPU_DEVICE) {
+        cpuOutput = &outputOtherDevice_[i];
+      }
+    }
+    CHECK(cpuOutput) << "Should have a CPU device output";
+    cpuOutput->setFrameHeight(output_.getFrameHeight());
+    cpuOutput->setFrameWidth(output_.getFrameWidth());
+
+    // fc cpu output value does not need conversion
+    cpuOutput->value = output_.value;
+  }
 
   // create forward handle
   prop_kind pk = prop_kind::forward;
@@ -176,12 +184,13 @@ void MKLDNNFcLayer::resetFwd() {
                     : fc_fwd::desc(
                           pk, inVal_->getMD(), wgtVal_->getMD(), outVal_->getMD());
   fc_fwd::primitive_desc fwdPD = fc_fwd::primitive_desc(fwdDesc, engine_);
-
   if (hasBias) {
     fwd_.reset(new fc_fwd(fwdPD, *inVal_, *wgtVal_, *biasVal_, *outVal_));
   } else {
     fwd_.reset(new fc_fwd(fwdPD, *inVal_, *wgtVal_, *outVal_));
   }
+  printValueFormatFlow();
+
   pipelineFwd_.clear();
   pipelineFwd_.push_back(*fwd_);
 }
@@ -197,17 +206,24 @@ void MKLDNNFcLayer::resetBwd() {
   CHECK(inVal_) << "Should have input value";
   const MatrixPtr& wgt = weight_->getWGrad();
   const MatrixPtr& bias = hasBias ? biases_->getWGrad() : nullptr;
-  const MatrixPtr& out = output_.grad;
 
-  wgtGrad_ = MKLDNNMatrix::create(
-      wgt, wgtVal_->getDims(), wgtVal_->getFormat(), engine_);
-  biasGrad_ =
-      hasBias ? MKLDNNMatrix::create(bias, {oc_}, format::x, engine_) : nullptr;
+  if (nextIsMKLDNN()) {
+    // can not directly cast the output grad to MKLDNNMatrix, since the
+    // next layers can not write their input grad into an mkldnn input
+    // grad; so just create one from the matrix with the output value format
+    const MatrixPtr& out = getOutput(MKLDNN_DEVICE).grad;
+    outGrad_ = MKLDNNMatrix::create(out, outVal_->getPD());
+    // TODO: maybe need to merge top diffs
+  } else {
+    // TODO: merge top diffs
+    const MatrixPtr& out = getOutput(CPU_DEVICE).grad;
+    // fc does not need to convert from the cpu device since the output
+    // is always nc; only need to create from the cpu device
+    outGrad_ = MKLDNNMatrix::create(out, outVal_->getPD());
+  }
 
-  outGrad_ = MKLDNNMatrix::create(out, {bs_, oc_}, format::nc, engine_);
-  // change original output to mkldnn output
-  // TODO: right?
-  output_.grad = std::dynamic_pointer_cast<Matrix>(outGrad_);
+  wgtGrad_ = MKLDNNMatrix::create(wgt, wgtVal_->getPD());
+  biasGrad_ = hasBias ?
+      MKLDNNMatrix::create(bias, biasVal_->getPD()) : nullptr;
 
   // create memory primitive desc
   fc_fwd::desc fwdDesc = fc_fwd::desc(prop_kind::forward,
@@ -235,21 +251,38 @@ void MKLDNNFcLayer::resetBwd() {
   pipelineBwd_.push_back(*bwdWgt_);
 
   /// backward data
-  const MatrixPtr& in = getInputGrad(0);
-  if (in == nullptr) {
-    return;
+  if (prevIsMKLDNN()) {
+    const MatrixPtr& in = getInputGrad(0, MKLDNN_DEVICE);
+    if (in == nullptr) {
+      return;
+    }
+    if (getInput(0, MKLDNN_DEVICE).getAllCount() > 1) {
+      // TODO: many mkldnn bottoms
+      // add sum handle
+    } else {
+      inGrad_ = MKLDNNMatrix::create(in, inVal_->getPD());
+    }
+  } else {
+    const MatrixPtr& in = getInputGrad(0, CPU_DEVICE);
+    if (in == nullptr) {
+      return;
+    }
+    if (getInput(0, CPU_DEVICE).getAllCount() > 1) {
+      // TODO: many bottoms
+      // add sum handle
+    } else {
+      inGrad_ = MKLDNNMatrix::create(in, inVal_->getPD());
+    }
   }
+
   fc_bwdData::desc bwdDataDesc =
       fc_bwdData::desc(inVal_->getMD(), wgtGrad_->getMD(), outGrad_->getMD());
   fc_bwdData::primitive_desc bwdDataPD =
       fc_bwdData::primitive_desc(bwdDataDesc, engine_, fwdPD);
-  // TODO: check right, just from ingrad?
-  inGrad_ =
-      MKLDNNMatrix::create(in, inVal_->getDims(), inVal_->getFormat(), engine_);
-
   CHECK(wgtVal_) << "Should have weight memory";
   bwdData_.reset(new fc_bwdData(bwdDataPD, *outGrad_, *wgtVal_, *inGrad_));
+  printGradFormatFlow();
   pipelineBwd_.push_back(*bwdData_);
 }
@@ -259,11 +292,7 @@ void MKLDNNFcLayer::forward(PassType passType) {
 
   {
     REGISTER_TIMER_INFO("mkldnn_FwdTimer", getName().c_str());
-
-    // update input data
-    // since it might be changed if this is after data layer
-    real* iData = getInputValue(0)->getData();
-    inVal_->updateData(iData);
+    syncInputValue();
 
     // just submit forward pipeline
     stream_->submit(pipelineFwd_);
@@ -285,10 +314,7 @@ void MKLDNNFcLayer::backward(const UpdateCallback& callback) {
     REGISTER_TIMER_INFO("mkldnn_bwdTimer", getName().c_str());
     resetBwd();
 
-    // update diff
-    real* oDiff = getOutputGrad()->getData();
-    outGrad_->updateData(oDiff);
-
+    syncOutputGrad();
     // just submit backward pipeline
     stream_->submit(pipelineBwd_);
   }
diff --git a/paddle/gserver/layers/MKLDNNLayer.h b/paddle/gserver/layers/MKLDNNLayer.h
index fbd62d9aaa306e60265caeece9ace19aa5694256..3dd17a36ff7eef18b0a451c73c2a7eb38cde519f 100644
--- a/paddle/gserver/layers/MKLDNNLayer.h
+++ b/paddle/gserver/layers/MKLDNNLayer.h
@@ -125,23 +125,80 @@ public:
                << ", oh: " << oh_ << ", ow: " << ow_;
   }
 
-  // TODO(TJ): move to MkldnnMatrix
-  // create memory desc
-  inline mkldnn::memory::desc createMD(
-      mkldnn::memory::dims dims,
-      mkldnn::memory::format fmt,
-      mkldnn::memory::data_type type = mkldnn::memory::data_type::f32) {
-    // TODO(TJ): isFmtSuppoted(fmt)
-    return mkldnn::memory::desc(dims, type, fmt);
+  /**
+   * Print the mkldnn memory format flow of value.
+   */
+  virtual void printValueFormatFlow() {
+    if (inVal_ && outVal_) {
+      VLOG(MKLDNN_FMTS) << "value format flow --- " << inVal_->getFormat()
+                        << " >>> " << outVal_->getFormat();
+    }
   }
 
-  void resetMKLDNNOutput(size_t height, size_t width) {
-    Layer::resetOutput(height, width);
-    // get valu and grad, use mkldnn matrix instaed
-    // output_.value;
+  /**
+   * Print the mkldnn memory format flow of grad.
+   */
+  virtual void printGradFormatFlow() {
+    if (inGrad_ && outGrad_) {
+      VLOG(MKLDNN_FMTS) << "grad format flow --- " << inGrad_->getFormat()
+                        << " <<< " << outGrad_->getFormat();
+    }
   }
 
 protected:
+  /**
+   * Whether the next layers are only of MKLDNN type.
+   * Otherwise, the only supported other device is CPU.
+   */
+  bool nextIsMKLDNN() {
+    for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
+      CHECK_EQ(outputOtherDevice_[i].deviceId, CPU_DEVICE)
+          << "Only support CPU as the other device yet";
+    }
+    return outputOtherDevice_.size() == 0;
+  }
+
+  /**
+   * Whether the previous layer is of MKLDNN type.
+   * Otherwise, the only supported device is CPU.
+   */
+  bool prevIsMKLDNN(int index = 0) {
+    int prevDevice = getPrev(index)->getDeviceId();
+    if (prevDevice == MKLDNN_DEVICE) {
+      return true;
+    } else {
+      // do not support GPU yet
+      CHECK_EQ(prevDevice, CPU_DEVICE) << "Only support CPU yet";
+      return false;
+    }
+  }
+
+  /**
+   * Sync the input value data from the CPU device.
+   */
+  void syncInputValue() {
+    if (prevIsMKLDNN()) {
+      return;
+    }
+    real* iData = getInputValue(0, CPU_DEVICE)->getData();
+    // update the input data, since it might have been changed
+    // if this layer follows a data layer
+    inVal_->updateData(iData);
+  }
+
+  /**
+   * Sync the output grad data from the CPU device.
+   */
+  void syncOutputGrad() {
+    if (nextIsMKLDNN()) {
+      return;
+    }
+
+    // update diff
+    real* oDiff = getOutput(CPU_DEVICE).grad->getData();
+    outGrad_->updateData(oDiff);
+  }
+
   /**
    * Set deviceId of this layer.
    */
diff --git a/paddle/math/Allocator.h b/paddle/math/Allocator.h
index 666a8b8368e3e2ebc522902c176d7491d2920d2a..94ef561f066a127496e2849a419835e175c526d7 100644
--- a/paddle/math/Allocator.h
+++ b/paddle/math/Allocator.h
@@ -48,7 +48,13 @@ public:
    */
   virtual void* alloc(size_t size) {
     void* ptr;
+#ifdef PADDLE_USE_MKLDNN
+    // refer to https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp
+    // for the required memory alignment
+    CHECK_EQ(posix_memalign(&ptr, 4096ul, size), 0);
+#else
     CHECK_EQ(posix_memalign(&ptr, 32ul, size), 0);
+#endif
     CHECK(ptr) << "Fail to allocate CPU memory: size=" << size;
     return ptr;
   }
diff --git a/paddle/math/MKLDNNMatrix.cpp b/paddle/math/MKLDNNMatrix.cpp
index 44fc54278c993aa8b35d34d801f5684ddf397dc4..24d54ec0f73136b2559705feb52262a7f90b8fc7 100644
--- a/paddle/math/MKLDNNMatrix.cpp
+++ b/paddle/math/MKLDNNMatrix.cpp
@@ -18,29 +18,74 @@ using namespace mkldnn;  // NOLINT
 
 namespace paddle {
 
-MKLDNNMatrixPtr MKLDNNMatrix::create(const MatrixPtr& m,
-                                     memory::dims dims,
-                                     memory::format fmt,
-                                     engine& eg,
-                                     mkldnn::memory::data_type dtype) {
-  CpuMatrixPtr cpuM = std::dynamic_pointer_cast<CpuMatrix>(m);
-  CHECK(cpuM) << "Only support create from CPU matrix yet";
-
-  size_t ndims = dims.size();
+MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m, memory::primitive_desc pd) {
+  memory::desc md = pd.desc();
+  size_t ndims = md.data.ndims;
+  int* dims = md.data.dims;
   CHECK(ndims > 0) << "Input dims should not be empty";
-  size_t cnt = 1;
+  size_t cnts = 1;
   for (size_t i = 0; i < ndims; ++i) {
-    cnt *= dims[i];
+    cnts *= dims[i];
   }
-  CHECK_EQ(cnt, m->getElementCnt()) << "Count size does not match";
 
+  if (m == nullptr) {
+    size_t height = dims[0];
+    size_t width = cnts / dims[0];
+    m = Matrix::create(height, width, false, false);
+  }
+
+  CHECK(m) << "Matrix should not be empty";
+  CpuMatrixPtr cpuMatrix = std::dynamic_pointer_cast<CpuMatrix>(m);
+  CHECK(cpuMatrix) << "Only support create from CPU matrix yet";
+
+  CHECK_EQ(cnts, m->getElementCnt()) << "Count size does not match";
   size_t width = m->getWidth();
   size_t height = m->getHeight();
   real* data = m->getData();
+  return std::make_shared<MKLDNNMatrix>(data, height, width, pd);
+}
 
+MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m,
+                                     memory::dims dims,
+                                     memory::format fmt,
+                                     engine& eg,
+                                     mkldnn::memory::data_type dtype) {
   memory::desc md = memory::desc(dims, dtype, fmt);
   memory::primitive_desc pd = memory::primitive_desc(md, eg);
-  return std::make_shared<MKLDNNMatrix>(data, height, width, pd);
+  return create(m, pd);
+}
+
+void MKLDNNMatrix::downSpatial() {
+  int fmt = getFormat();
+  if (!(fmt == memory::format::nchw || fmt == memory::format::oihw)) {
+    // only support nchw and oihw yet, later can support more like nhwc, ihwo
+    return;
+  }
+
+  memory::dims srcDims = getDims();
+  const int H = 2, W = 3;
+  if (srcDims[H] != 1 || srcDims[W] != 1) {
+    // can not reduce the spatial dims
+    return;
+  }
+
+  memory::dims dstDims = memory::dims{srcDims[0], srcDims[1]};
+  memory::format dstFmt;
+  switch (fmt) {
+    case memory::format::nchw:
+      dstFmt = memory::format::nc;
+      break;
+    case memory::format::oihw:
+      dstFmt = memory::format::oi;
+      break;
+    default:
+      LOG(FATAL) << "unsupported format";
+  }
+  memory::desc md = memory::desc(dstDims, getDtype(), dstFmt);
+  memory::primitive_desc pd = memory::primitive_desc(md, getEngine());
+  void* data = getData();
+  // rebind this memory to the reduced primitive desc while keeping the
+  // original data handle; a plain temporary `memory(pd, data)` would be
+  // discarded immediately and leave *this unchanged
+  mkldnn_primitive_t result;
+  mkldnn::error::wrap_c_api(
+      mkldnn_primitive_create(&result, pd.get(), nullptr, nullptr),
+      "could not create a memory primitive");
+  reset(result);
+  set_data_handle(data);
 }
 
 }  // namespace paddle
diff --git a/paddle/math/MKLDNNMatrix.h b/paddle/math/MKLDNNMatrix.h
index 54c0a1fdcbc477643848b6374653de15f252bf50..05adc867c2076511da22834a73e0665c4e5cfc68 100644
--- a/paddle/math/MKLDNNMatrix.h
+++ b/paddle/math/MKLDNNMatrix.h
@@ -39,20 +39,37 @@ public:
               mkldnn::memory::primitive_desc pd)
       : CpuMatrix(data, height, width, false), mkldnn::memory(pd, data) {}
 
-  MKLDNNMatrix(size_t height, size_t width, mkldnn::memory::primitive_desc pd)
-      : CpuMatrix(height, width, false), mkldnn::memory(pd) {
-    set_data_handle(CpuMatrix::getData());
-  }
-
   ~MKLDNNMatrix() {}
 
+  /**
+   * Create MKLDNNMatrix from a MatrixPtr and a memory primitive_desc.
+   */
+  static MKLDNNMatrixPtr create(MatrixPtr m, mkldnn::memory::primitive_desc pd);
+
+  /**
+   * Create MKLDNNMatrix from a MatrixPtr and the memory details
+   * (dims, format, engine and data type).
+   */
   static MKLDNNMatrixPtr create(
-      const MatrixPtr& m,
+      MatrixPtr m,
       mkldnn::memory::dims dims,
       mkldnn::memory::format fmt,
       mkldnn::engine& eg,
       mkldnn::memory::data_type dtype = mkldnn::memory::data_type::f32);
 
+public:
+  /**
+   * Dimensionality reduction.
+   * Change format "nchw --> nc" or "oihw --> oi" if the h and w are both 1.
+   */
+  void downSpatial();
+
+  /**
+   * Update the memory data handle.
+   * Caution: this does not check the buffer size of the data;
+   * that should be covered by the user.
+   */
+  void updateData(void* data) { set_data_handle(data); }
+
   /**
    * Get primitive descriptor.
    */
@@ -64,12 +81,13 @@ public:
   mkldnn::memory::desc getMD() { return getPD().desc(); }
 
   /**
-   * Get dims.
+   * Get dimensions.
   */
   mkldnn::memory::dims getDims() {
+    mkldnn::memory::desc md = getMD();
+    const int* src = md.data.dims;
+    int ndims = md.data.ndims;
     mkldnn::memory::dims dst;
-    int* src = getMD().data.dims;
-    int ndims = getMD().data.ndims;
     dst.resize(ndims);
     for (int i = 0; i < ndims; ++i) {
       dst[i] = src[i];
@@ -85,11 +103,16 @@ public:
   }
 
   /**
-   * Update the memory data handle.
-   * Caution: This will not check the buffer size of the data,
-   * it should be coverd by user.
+   * Get memory data type.
    */
-  void updateData(void* data) { set_data_handle(data); }
+  mkldnn::memory::data_type getDtype() {
+    return (mkldnn::memory::data_type)(getMD().data.data_type);
+  }
+
+  /**
+   * Get engine.
+   */
+  mkldnn::engine getEngine() { return getPD().get_engine(); }
 };
 
 }  // namespace paddle
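
For reference, a minimal sketch of how the two create() overloads and downSpatial() compose (not part of the patch; the function name and the {16, 8, 1, 1} weight dims are purely illustrative, and MKL-DNN v0.x is assumed):

// sketch, not part of the patch: exercises the new MKLDNNMatrix API
#include "paddle/math/MKLDNNMatrix.h"

using namespace mkldnn;  // NOLINT

void createAndDownSpatialSketch() {
  engine eg(engine::cpu, 0);

  // a 16x8 CPU weight; the element count must equal oc*ic*ih*iw = 16*8*1*1
  paddle::MatrixPtr wgt = paddle::Matrix::create(16, 8, false, false);
  paddle::MKLDNNMatrixPtr wgtVal = paddle::MKLDNNMatrix::create(
      wgt, memory::dims{16, 8, 1, 1}, memory::format::oihw, eg);

  // ih == iw == 1, so the oihw format collapses to oi
  wgtVal->downSpatial();

  // passing a null MatrixPtr lets the primitive_desc overload allocate the
  // CPU matrix itself (height = dims[0], width = the remaining elements)
  memory::desc md(memory::dims{16, 8}, memory::data_type::f32,
                  memory::format::oi);
  paddle::MKLDNNMatrixPtr allocated =
      paddle::MKLDNNMatrix::create(nullptr, memory::primitive_desc(md, eg));
}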
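
syncInputValue() and syncOutputGrad() lean on updateData() being a zero-copy handle swap: the mkldnn memory is re-pointed at the CPU buffer each iteration, and matching buffer sizes stay the caller's responsibility, as the header warns. A sketch with that check made explicit (the function name and the CHECK_EQ are illustrative, not from the patch):

#include "paddle/math/MKLDNNMatrix.h"

void syncSketch(paddle::MKLDNNMatrixPtr inVal, paddle::MatrixPtr cpuIn) {
  // the caller must guarantee matching sizes; updateData() will not check
  CHECK_EQ(cpuIn->getElementCnt(), inVal->getElementCnt());
  // re-point the mkldnn memory at the CPU buffer; no data is copied
  inVal->updateData(cpuIn->getData());
}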
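
The Allocator change only raises the posix_memalign alignment from 32 to 4096 bytes under PADDLE_USE_MKLDNN, the page-size alignment suggested in mkldnn.hpp. A standalone sketch of that contract (the helper name is illustrative):

#include <cassert>
#include <cstdint>
#include <cstdlib>

void* mkldnnAlignedAllocSketch(size_t size) {
  void* ptr = nullptr;
  // posix_memalign returns 0 on success and fills ptr with an address
  // that is a multiple of the requested 4096-byte alignment
  int ret = posix_memalign(&ptr, 4096ul, size);
  assert(ret == 0 && ptr != nullptr);
  assert(reinterpret_cast<uintptr_t>(ptr) % 4096ul == 0);
  return ptr;
}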