提交 48d87e5e 编写于 作者: T tensor-tang

pass test, support input CPU device

上级 4eecd0c2
......@@ -82,6 +82,7 @@ protected:
Argument output_;
/// Several outputs stored on different devices, used in 'parallel_nn' case,
/// and record them by deviceId_.
/// Also used in 'use_mkldnn' case.
std::vector<Argument> outputOtherDevice_;
/// If there are several outputs, map them by each name.
std::map<std::string, Argument*> outputMap_;
......@@ -177,6 +178,13 @@ protected:
return inputLayer.getOutput(deviceId_);
}
/**
 * Get the output argument of the inputIndex-th input layer, taking the
 * copy recorded for the given device id.
 */
const Argument& getInput(size_t inputIndex, int deviceId) const {
  const auto& prevLayer = inputLayers_[inputIndex];
  return prevLayer->getOutput(deviceId);
}
/**
* Get the forward-input value.
*/
......@@ -191,6 +199,13 @@ protected:
return inputLayer.getOutput(deviceId_).value;
}
/**
 * Get the forward-input value with deviceId.
 * Returns the value matrix of the inputIndex-th input layer's output
 * stored for the given device.
 * Marked const for consistency with getInput(inputIndex, deviceId).
 */
const MatrixPtr& getInputValue(int inputIndex, int deviceId) const {
  return inputLayers_[inputIndex]->getOutput(deviceId).value;
}
/**
* Get the forward-input grad.
*/
......@@ -205,6 +220,13 @@ protected:
return inputLayer.getOutput(deviceId_).grad;
}
/**
 * Get the forward-input grad with deviceId.
 * Returns the grad matrix of the inputIndex-th input layer's output
 * stored for the given device.
 * (Comment previously duplicated the no-device overload's text.)
 * Marked const for consistency with getInput(inputIndex, deviceId).
 */
const MatrixPtr& getInputGrad(int inputIndex, int deviceId) const {
  return inputLayers_[inputIndex]->getOutput(deviceId).grad;
}
/**
* Get the forward-input label.
*/
......@@ -326,19 +348,6 @@ public:
if (deviceId == getDeviceId()) {
return output_;
} else {
bool CPU2MKLDNN =
getDeviceId() == CPU_DEVICE && deviceId == MKLDNN_DEVICE;
bool MKLDNN2CPU =
getDeviceId() == MKLDNN_DEVICE && deviceId == CPU_DEVICE;
if (CPU2MKLDNN) {
// TODO: do something
return output_;
} else if (MKLDNN2CPU) {
// TODO: do something
return output_;
}
// TODO: handle mkldnn device or add mkldnn device to other
for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
if (outputOtherDevice_[i].deviceId == deviceId) {
return outputOtherDevice_[i];
......
......@@ -97,7 +97,7 @@ void MKLDNNFcLayer::convertWeightsToPaddle() {
}
void MKLDNNFcLayer::reshape() {
const Argument& input = getInput(0);
const Argument& input = getInput(0, getPrev(0)->getDeviceId());
int batchSize = input.getBatchSize();
if (bs_ == batchSize) {
return;
......@@ -135,35 +135,43 @@ void MKLDNNFcLayer::reshape() {
void MKLDNNFcLayer::resetFwd() {
bool hasBias = biases_ && biases_->getW();
const MatrixPtr& in = getInputValue(0);
const MatrixPtr& wgt = weight_->getW();
const MatrixPtr& bias = hasBias ? biases_->getW() : nullptr;
const MatrixPtr& out = output_.value;
if (getPrev(0)->getDeviceId() == MKLDNN_DEVICE) {
if (prevIsMKLDNN()) {
const MatrixPtr& in = getInputValue(0);
inVal_ = std::dynamic_pointer_cast<MKLDNNMatrix>(in);
CHECK(inVal_) << "Input should be MKLDNNMatrix";
// TODO: change input nchw to nc if available
// inVal_->downSpatial()
} else {
CHECK_EQ(getPrev(0)->getDeviceId(), CPU_DEVICE) << "Only support CPU yet";
const MatrixPtr& in = getInputValue(0, CPU_DEVICE);
inVal_ = MKLDNNMatrix::create(
in,
hasSpatial_ ? memory::dims{bs_, ic_, ih_, iw_} : memory::dims{bs_, ic_},
hasSpatial_ ? format::nchw : format::nc,
engine_);
in, memory::dims{bs_, ic_, ih_, iw_}, format::nchw, engine_);
}
inVal_->downSpatial();
wgtVal_ = MKLDNNMatrix::create(
wgt,
hasSpatial_ ? memory::dims{oc_, ic_, ih_, iw_} : memory::dims{oc_, ic_},
hasSpatial_ ? format::oihw : format::oi,
engine_);
wgt, memory::dims{oc_, ic_, ih_, iw_}, format::oihw, engine_);
wgtVal_->downSpatial();
biasVal_ =
hasBias ? MKLDNNMatrix::create(bias, {oc_}, format::x, engine_) : nullptr;
outVal_ = MKLDNNMatrix::create(out, {bs_, oc_}, format::nc, engine_);
// change original output to mkldnn output
// change original output value to mkldnn output value
output_.value = std::dynamic_pointer_cast<Matrix>(outVal_);
if (!nextIsMKLDNN()) {
Argument cpuOutput;
for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
if (outputOtherDevice_[i].deviceId == CPU_DEVICE) {
cpuOutput = outputOtherDevice_[i];
}
}
cpuOutput.setFrameHeight(output_.getFrameHeight());
cpuOutput.setFrameWidth(output_.getFrameWidth());
// fc cpu output value do not need convert
cpuOutput.value = output_.value;
}
// create forward handle
prop_kind pk = prop_kind::forward;
......@@ -176,12 +184,13 @@ void MKLDNNFcLayer::resetFwd() {
: fc_fwd::desc(
pk, inVal_->getMD(), wgtVal_->getMD(), outVal_->getMD());
fc_fwd::primitive_desc fwdPD = fc_fwd::primitive_desc(fwdDesc, engine_);
if (hasBias) {
fwd_.reset(new fc_fwd(fwdPD, *inVal_, *wgtVal_, *biasVal_, *outVal_));
} else {
fwd_.reset(new fc_fwd(fwdPD, *inVal_, *wgtVal_, *outVal_));
}
printValueFormatFlow();
pipelineFwd_.clear();
pipelineFwd_.push_back(*fwd_);
}
......@@ -197,17 +206,24 @@ void MKLDNNFcLayer::resetBwd() {
CHECK(inVal_) << "Should have input value";
const MatrixPtr& wgt = weight_->getWGrad();
const MatrixPtr& bias = hasBias ? biases_->getWGrad() : nullptr;
const MatrixPtr& out = output_.grad;
wgtGrad_ = MKLDNNMatrix::create(
wgt, wgtVal_->getDims(), wgtVal_->getFormat(), engine_);
biasGrad_ =
hasBias ? MKLDNNMatrix::create(bias, {oc_}, format::x, engine_) : nullptr;
if (nextIsMKLDNN()) {
// can not directly cast outputgrad to mkldnnmatrix,
// since each layer can not write the inputgrad to mkldnn inputgrad.
// So just create from matrix with outputvalue format.
const MatrixPtr& out = getOutput(MKLDNN_DEVICE).grad;
outGrad_ = MKLDNNMatrix::create(out, outVal_->getPD());
// TODO: maybe need merge topdiffs
} else {
// TODO: merge topdiffs
const MatrixPtr& out = getOutput(CPU_DEVICE).grad;
// fc do not need to convert from cpu device since output always nc
// only need create from cpu device
outGrad_ = MKLDNNMatrix::create(out, outVal_->getPD());
}
outGrad_ = MKLDNNMatrix::create(out, {bs_, oc_}, format::nc, engine_);
// change original output to mkldnn output
// TODO: right?
output_.grad = std::dynamic_pointer_cast<Matrix>(outGrad_);
wgtGrad_ = MKLDNNMatrix::create(wgt, wgtVal_->getPD());
biasGrad_ = hasBias ? MKLDNNMatrix::create(bias, biasVal_->getPD()) : nullptr;
// create memory primitive desc
fc_fwd::desc fwdDesc = fc_fwd::desc(prop_kind::forward,
......@@ -235,21 +251,38 @@ void MKLDNNFcLayer::resetBwd() {
pipelineBwd_.push_back(*bwdWgt_);
/// backward data
const MatrixPtr& in = getInputGrad(0);
if (in == nullptr) {
return;
if (prevIsMKLDNN()) {
const MatrixPtr& in = getInputGrad(0, MKLDNN_DEVICE);
if (in == nullptr) {
return;
}
if (getInput(0, MKLDNN_DEVICE).getAllCount() > 1) {
// TODO: many mkldnn bots
// add sum handle
} else {
inGrad_ = MKLDNNMatrix::create(in, inVal_->getPD());
}
} else {
const MatrixPtr& in = getInputGrad(0, CPU_DEVICE);
if (in == nullptr) {
return;
}
if (getInput(0, CPU_DEVICE).getAllCount() > 1) {
// TODO: many bots
// add sum handle
} else {
inGrad_ = MKLDNNMatrix::create(in, inVal_->getPD());
}
}
fc_bwdData::desc bwdDataDesc =
fc_bwdData::desc(inVal_->getMD(), wgtGrad_->getMD(), outGrad_->getMD());
fc_bwdData::primitive_desc bwdDataPD =
fc_bwdData::primitive_desc(bwdDataDesc, engine_, fwdPD);
// TODO: check right, just from ingrad?
inGrad_ =
MKLDNNMatrix::create(in, inVal_->getDims(), inVal_->getFormat(), engine_);
CHECK(wgtVal_) << "Should have weight memory";
bwdData_.reset(new fc_bwdData(bwdDataPD, *outGrad_, *wgtVal_, *inGrad_));
printGradFormatFlow();
pipelineBwd_.push_back(*bwdData_);
}
......@@ -259,11 +292,7 @@ void MKLDNNFcLayer::forward(PassType passType) {
{
REGISTER_TIMER_INFO("mkldnn_FwdTimer", getName().c_str());
// update input data
// since it might be changed if this is after data layer
real* iData = getInputValue(0)->getData();
inVal_->updateData(iData);
syncInputValue();
// just submit forward pipeline
stream_->submit(pipelineFwd_);
......@@ -285,10 +314,7 @@ void MKLDNNFcLayer::backward(const UpdateCallback& callback) {
REGISTER_TIMER_INFO("mkldnn_bwdTimer", getName().c_str());
resetBwd();
// update diff
real* oDiff = getOutputGrad()->getData();
outGrad_->updateData(oDiff);
syncOutputGrad();
// just sumbmit backward pipeline
stream_->submit(pipelineBwd_);
}
......
......@@ -125,23 +125,80 @@ public:
<< ", oh: " << oh_ << ", ow: " << ow_;
}
// TODO(TJ): move to MkldnnMatrix
// create memory desc
inline mkldnn::memory::desc createMD(
mkldnn::memory::dims dims,
mkldnn::memory::format fmt,
mkldnn::memory::data_type type = mkldnn::memory::data_type::f32) {
// TODO(TJ): isFmtSuppoted(fmt)
return mkldnn::memory::desc(dims, type, fmt);
/**
 * Print the mkldnn memory format flow of value
 */
virtual void printValueFormatFlow() {
  if (!inVal_ || !outVal_) {
    return;
  }
  VLOG(MKLDNN_FMTS) << "value format flow --- " << inVal_->getFormat()
                    << " >>> " << outVal_->getFormat();
}
void resetMKLDNNOutput(size_t height, size_t width) {
Layer::resetOutput(height, width);
// get valu and grad, use mkldnn matrix instaed
// output_.value;
/**
 * Print the mkldnn memory format flow of grad
 */
virtual void printGradFormatFlow() {
  if (!inGrad_ || !outGrad_) {
    return;
  }
  VLOG(MKLDNN_FMTS) << "grad format flow --- " << inGrad_->getFormat()
                    << " <<< " << outGrad_->getFormat();
}
protected:
/**
 * Whether this layer's output is consumed only by MKLDNN layers
 * (i.e. there are no per-device output copies at all).
 * Any output copy for another device must be for CPU; other devices
 * are not supported yet.
 */
bool nextIsMKLDNN() {
  for (const auto& other : outputOtherDevice_) {
    CHECK_EQ(other.deviceId, CPU_DEVICE)
        << "Only support other device is CPU yet";
  }
  return outputOtherDevice_.empty();
}
/**
 * Whether the index-th previous layer is an MKLDNN layer.
 * If it is not, it must reside on the CPU device; GPU is not
 * supported yet.
 */
bool prevIsMKLDNN(int index = 0) {
  const int dev = getPrev(index)->getDeviceId();
  if (dev == MKLDNN_DEVICE) {
    return true;
  }
  // do not support GPU yet
  CHECK_EQ(dev, CPU_DEVICE) << "Only support CPU yet";
  return false;
}
/**
* Sync input value data
*/
void syncInputValue() {
if (prevIsMKLDNN()) {
return;
}
real* iData = getInputValue(0, CPU_DEVICE)->getData();
// update input data
// since it might be changed if this is after data layer
inVal_->updateData(iData);
}
/**
* Sync output grad data
*/
void syncOutputGrad() {
if (nextIsMKLDNN()) {
return;
}
// update diff
real* oDiff = getOutput(CPU_DEVICE).grad->getData();
outGrad_->updateData(oDiff);
}
/**
* Set deviceId of this layer.
*/
......
......@@ -48,7 +48,13 @@ public:
*/
/**
 * Allocate `size` bytes of aligned CPU memory.
 * With MKL-DNN enabled, buffers are aligned to 4096 bytes as recommended
 * by the library; see
 * https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp
 * Otherwise the historical 32-byte alignment is kept.
 * Aborts (CHECK) on allocation failure.
 */
virtual void* alloc(size_t size) {
  void* ptr;
#ifdef PADDLE_USE_MKLDNN
  // MKL-DNN memory alignment requirement
  const size_t alignment = 4096ul;
#else
  const size_t alignment = 32ul;
#endif
  CHECK_EQ(posix_memalign(&ptr, alignment, size), 0);
  CHECK(ptr) << "Fail to allocate CPU memory: size=" << size;
  return ptr;
}
......
......@@ -18,29 +18,74 @@ using namespace mkldnn; // NOLINT
namespace paddle {
MKLDNNMatrixPtr MKLDNNMatrix::create(const MatrixPtr& m,
memory::dims dims,
memory::format fmt,
engine& eg,
mkldnn::memory::data_type dtype) {
CpuMatrixPtr cpuM = std::dynamic_pointer_cast<CpuMatrix>(m);
CHECK(cpuM) << "Only support create from CPU matrix yet";
size_t ndims = dims.size();
/**
 * Create an MKLDNNMatrix from a MatrixPtr and a memory primitive_desc.
 * If m is null, a CpuMatrix is allocated with height = dims[0] and
 * width = product of the remaining dims.
 * Removed stale pre-refactor residue: a duplicate `cnt` accumulator and a
 * CHECK_EQ on m->getElementCnt() that dereferenced m BEFORE the null check.
 */
MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m, memory::primitive_desc pd) {
  memory::desc md = pd.desc();
  size_t ndims = md.data.ndims;
  int* dims = md.data.dims;
  CHECK(ndims > 0) << "Input dims should not be empty";
  // total element count described by the primitive desc
  size_t cnts = 1;
  for (size_t i = 0; i < ndims; ++i) {
    cnts *= dims[i];
  }

  if (m == nullptr) {
    size_t height = dims[0];
    size_t width = cnts / dims[0];
    m = Matrix::create(height, width, false, false);
  }

  CHECK(m) << " Matrix should not be empty";
  CpuMatrixPtr cpuMatrix = std::dynamic_pointer_cast<CpuMatrix>(m);
  CHECK(cpuMatrix) << "Only support create from CPU matrix yet";
  CHECK_EQ(cnts, m->getElementCnt()) << "Count size does not match";

  size_t width = m->getWidth();
  size_t height = m->getHeight();
  real* data = m->getData();
  return std::make_shared<MKLDNNMatrix>(data, height, width, pd);
}
/**
 * Create an MKLDNNMatrix from a MatrixPtr plus explicit memory details
 * (dims, format, engine, data type); builds the primitive_desc and
 * delegates to the primitive_desc overload.
 * Removed a stale residual `return std::make_shared<MKLDNNMatrix>(data,
 * height, width, pd);` line that referenced undeclared locals and made
 * the real return unreachable.
 */
MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m,
                                     memory::dims dims,
                                     memory::format fmt,
                                     engine& eg,
                                     mkldnn::memory::data_type dtype) {
  memory::desc md = memory::desc(dims, dtype, fmt);
  memory::primitive_desc pd = memory::primitive_desc(md, eg);
  return create(m, pd);
}
void MKLDNNMatrix::downSpatial() {
int fmt = getFormat();
if (!(fmt == memory::format::nchw || fmt == memory::format::oihw)) {
// only support nchw and oihw yet, later can support more like nhwc, ihwo
return;
}
memory::dims srcDims = getDims();
const int H = 2, W = 3;
if (srcDims[H] != 1 || srcDims[W] != 1) {
// can not down spatial
return;
}
memory::dims dstDims = memory::dims{srcDims[0], srcDims[1]};
memory::format dstFmt;
switch (fmt) {
case memory::format::nchw:
dstFmt = memory::format::nc;
break;
case memory::format::oihw:
dstFmt = memory::format::oi;
break;
default:
LOG(FATAL) << "unsupported format";
}
memory::desc md = memory::desc(dstDims, getDtype(), dstFmt);
memory::primitive_desc pd = memory::primitive_desc(md, getEngine());
void* data = getData();
memory(pd, data);
}
} // namespace paddle
......@@ -39,20 +39,37 @@ public:
mkldnn::memory::primitive_desc pd)
: CpuMatrix(data, height, width, false), mkldnn::memory(pd, data) {}
// Construct an MKLDNNMatrix whose buffer is freshly allocated by the
// CpuMatrix base, then bound as the data handle of the mkldnn memory
// primitive described by pd.
MKLDNNMatrix(size_t height, size_t width, mkldnn::memory::primitive_desc pd)
: CpuMatrix(height, width, false), mkldnn::memory(pd) {
set_data_handle(CpuMatrix::getData());
}
~MKLDNNMatrix() {}
/**
* Create MKLDNNMatrix from a MatrixPtr and memory primitive_desc
*/
static MKLDNNMatrixPtr create(MatrixPtr m, mkldnn::memory::primitive_desc pd);
/**
* Create MKLDNNMatrix from a MatrixPtr and memory details info
*/
static MKLDNNMatrixPtr create(
const MatrixPtr& m,
MatrixPtr m,
mkldnn::memory::dims dims,
mkldnn::memory::format fmt,
mkldnn::engine& eg,
mkldnn::memory::data_type dtype = mkldnn::memory::data_type::f32);
public:
/**
* Dimensionality reduction.
* Change format "nchw --> nc" or "oihw --> oi" if the h and w are both 1
*/
void downSpatial();
/**
 * Update the memory data handle to point at an externally owned buffer.
 * Caution: this does not check the buffer size of the data;
 * ensuring the buffer is large enough must be covered by the user.
 */
void updateData(void* data) { set_data_handle(data); }
/**
* Get primitive descriptor.
*/
......@@ -64,12 +81,13 @@ public:
mkldnn::memory::desc getMD() { return getPD().desc(); }
/**
* Get dims.
* Get dimensions.
*/
mkldnn::memory::dims getDims() {
mkldnn::memory::desc md = getMD();
const int* src = md.data.dims;
int ndims = md.data.ndims;
mkldnn::memory::dims dst;
int* src = getMD().data.dims;
int ndims = getMD().data.ndims;
dst.resize(ndims);
for (int i = 0; i < ndims; ++i) {
dst[i] = src[i];
......@@ -85,11 +103,16 @@ public:
}
/**
* Update the memory data handle.
* Caution: This will not check the buffer size of the data,
* it should be coverd by user.
* Get memory data type.
*/
void updateData(void* data) { set_data_handle(data); }
// Get memory data type of this mkldnn memory descriptor.
// Uses static_cast instead of the previous C-style cast.
mkldnn::memory::data_type getDtype() {
  return static_cast<mkldnn::memory::data_type>(getMD().data.data_type);
}
/**
* Get engine.
*/
mkldnn::engine getEngine() { return getPD().get_engine(); }
};
} // namespace paddle
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册