diff --git a/paddle/gserver/layers/MKLDNNLayer.h b/paddle/gserver/layers/MKLDNNLayer.h
index 2c382a6d4f3761e3f994936368de2775510c7111..31867477c3278f6c69bd851996199e0e9f123e75 100644
--- a/paddle/gserver/layers/MKLDNNLayer.h
+++ b/paddle/gserver/layers/MKLDNNLayer.h
@@ -67,8 +67,14 @@ protected:
 
   // merge grad primitive
   std::shared_ptr<mkldnn::sum> mergeGrad_;
+  std::vector<mkldnn::primitive> pipelineMergeGrad_;
   // tmp input argument to save input grad, only used to merge grad
   Argument tmpInArg_;
+  // since mkldnn sum does not support different formats
+  // (refer to https://github.com/01org/mkl-dnn/issues/134),
+  // we need to create the reorder manually and save a tmp MKLDNNMatrix
+  MKLDNNMatrixPtr tmpOutGrad_;
+  std::shared_ptr<mkldnn::reorder> tmpCvt_;
 
 public:
   explicit MKLDNNLayer(const LayerConfig& config)
@@ -148,9 +154,17 @@ public:
     if (needResetBwd_) {
       VLOG(MKLDNN_BASE) << getName() << " reset mkldnn backward";
       pipelineBwd_.clear();
+      pipelineMergeGrad_.clear();
+      mergeGrad_ = nullptr;
       resetBwd(pipelineBwd_, inGrad_, wgtGrad_, biasGrad_, outGrad_);
       needResetBwd_ = false;
     }
+
+    // merging the grads must happen before the backward activation
+    if (mergeGrad_) {
+      REGISTER_TIMER_INFO("MergeBpGrad", getName().c_str());
+      stream_->submit(pipelineMergeGrad_);
+    }
     {
       REGISTER_TIMER_INFO("BpActTimer", getName().c_str());
       backwardActivation();
@@ -262,6 +276,7 @@ protected:
                       mkldnn::memory::primitive_desc pd) {
     CHECK(outputIsOnlyMKLDNN()) << "do not support mixed with other device yet";
     mergeGrad_ = nullptr;
+    pipelineMergeGrad_.clear();
     out = MKLDNNMatrix::create(output_.grad, pd);
     if (outputMap_.size() <= 1) {
       return;
@@ -272,6 +287,7 @@ protected:
     for (auto it = outputMap_.begin(); it != outputMap_.end(); ++it) {
       MKLDNNMatrixPtr src =
           std::dynamic_pointer_cast<MKLDNNMatrix>(it->second->grad);
+      VLOG(MKLDNN_BASE) << getName() << " has output grad " << it->first;
       CHECK(src) << "should be MKLDNNMatrix";
       auto srcDims = src->getDims();
       auto dstDims = out->getDims();
@@ -283,9 +299,26 @@ protected:
       srcs.push_back(*src);
       scales.push_back(1.0);
     }
-    auto sumPD = mkldnn::sum::primitive_desc(pd.desc(), scales, srcPDs);
-    mergeGrad_.reset(new mkldnn::sum(sumPD, srcs, *out));
-    pipelineBwd_.insert(pipelineBwd_.begin(), *mergeGrad_);
+
+    // TODO(TJ): remove me when mkldnn sum supports different formats
+    for (size_t i = 1; i < srcPDs.size(); ++i) {
+      CHECK(srcPDs[0] == srcPDs[i]);
+    }
+    tmpOutGrad_ = nullptr;
+    tmpCvt_ = nullptr;
+    if (out->getPrimitiveDesc() != srcPDs[0]) {
+      tmpOutGrad_ = MKLDNNMatrix::create(nullptr, srcPDs[0]);
+      tmpCvt_ = MKLDNNMatrix::createReorder(tmpOutGrad_, out);
+      CHECK(tmpCvt_);
+      pipelineMergeGrad_.push_back(*tmpCvt_);
+    } else {
+      tmpOutGrad_ = out;
+    }
+
+    auto sumPD = mkldnn::sum::primitive_desc(
+        tmpOutGrad_->getMemoryDesc(), scales, srcPDs);
+    mergeGrad_.reset(new mkldnn::sum(sumPD, srcs, *tmpOutGrad_));
+    pipelineMergeGrad_.insert(pipelineMergeGrad_.begin(), *mergeGrad_);
   }
 
   /**
@@ -299,7 +332,7 @@ protected:
     const MatrixPtr& grad = input->getOutputMapSize() > 1 ?
                                nullptr : input->getOutput().grad;
     in = MKLDNNMatrix::create(grad, pd);
-    auto arg = input->getOutput(this->getName());
+    Argument& arg = input->getOutput(this->getName());
     arg.grad = std::dynamic_pointer_cast<Matrix>(in);
   }
 
diff --git a/paddle/trainer/tests/CMakeLists.txt b/paddle/trainer/tests/CMakeLists.txt
index 066837ca959e46dbe3b39c661aa1bab11cbf2734..db87ea205316426703d8d9451bd0896064a3bd8a 100644
--- a/paddle/trainer/tests/CMakeLists.txt
+++ b/paddle/trainer/tests/CMakeLists.txt
@@ -48,6 +48,13 @@ if(WITH_MKLDNN)
             --config_file_b=trainer/tests/sample_trainer_config_simple_net.conf --use_mkldnn_b=False
             --use_gpu=False
         WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
+    add_test(NAME test_CompareMKLDNNandCPU_Branches
+        COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
+            ${CMAKE_CURRENT_BINARY_DIR}/test_CompareMKLDNNandCPU
+            --config_file_a=trainer/tests/sample_trainer_config_branch_net.conf --use_mkldnn_a=True
+            --config_file_b=trainer/tests/sample_trainer_config_branch_net.conf --use_mkldnn_b=False
+            --use_gpu=False
+        WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
 endif()
 
 ############### test_CompareTwoOpts ###################
diff --git a/paddle/trainer/tests/sample_trainer_config_branch_net.conf b/paddle/trainer/tests/sample_trainer_config_branch_net.conf
new file mode 100644
index 0000000000000000000000000000000000000000..c2594bc13c250a877a7b8a77e11405671c4d8907
--- /dev/null
+++ b/paddle/trainer/tests/sample_trainer_config_branch_net.conf
@@ -0,0 +1,103 @@
+# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+################################### Data Configuration ###################################
+TrainData(ProtoData(files = "trainer/tests/mnist.list"))
+################################### Algorithm Configuration ###################################
+settings(batch_size = 256,
+         learning_method = MomentumOptimizer(momentum=0.5, sparse=False))
+################################### Network Configuration ###################################
+data = data_layer(name="input", size=784)
+
+tmp = img_conv_layer(input=data,
+                     num_channels=1,
+                     filter_size=3,
+                     num_filters=32,
+                     padding=1,
+                     shared_biases=True,
+                     act=ReluActivation())
+
+a1 = img_conv_layer(input=tmp,
+                    filter_size=1,
+                    num_filters=32,
+                    padding=0,
+                    shared_biases=True,
+                    act=ReluActivation())
+
+a2 = img_conv_layer(input=tmp,
+                    filter_size=3,
+                    num_filters=32,
+                    padding=1,
+                    shared_biases=True,
+                    act=ReluActivation())
+
+tmp = concat_layer(input=[a1, a2])
+
+tmp = img_pool_layer(input=tmp,
+                     num_channels=64,
+                     pool_size=3,
+                     stride=2,
+                     padding=1,
+                     pool_type=AvgPooling())
+
+b1 = img_conv_layer(input=tmp,
+                    filter_size=3,
+                    num_filters=64,
+                    padding=1,
+                    shared_biases=True,
+                    act=ReluActivation())
+
+b1 = img_pool_layer(input=b1,
+                    pool_size=3,
+                    stride=1,
+                    padding=1,
+                    pool_type=MaxPooling())
+
+b2 = img_conv_layer(input=tmp,
+                    filter_size=5,
+                    num_filters=64,
+                    padding=2,
+                    shared_biases=True,
+                    act=ReluActivation())
+
+b2 = img_pool_layer(input=b2,
+                    pool_size=5,
+                    stride=1,
+                    padding=2,
+                    pool_type=MaxPooling())
+
+tmp = addto_layer(input=[b1, b2],
+                  act=ReluActivation(),
+                  bias_attr=False)
+
+tmp = img_pool_layer(input=tmp,
+                     pool_size=3,
+                     stride=2,
+                     padding=1,
+                     pool_type=MaxPooling())
+
+tmp = fc_layer(input=tmp, size=64,
+               bias_attr=False,
+               act=TanhActivation())
+
+output = fc_layer(input=tmp, size=10,
+                  bias_attr=True,
+                  act=SoftmaxActivation())
+
+lbl = data_layer(name="label", size=10)
+
+cost = classification_cost(input=output, label=lbl)
+outputs(cost)
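
Note on the resetMergeGrad change in MKLDNNLayer.h: mkldnn's sum primitive
(v0.x API) rejects operands whose memory formats differ (see
https://github.com/01org/mkl-dnn/issues/134), so the patch sums the output
gradients into a temporary MKLDNNMatrix in the sources' format and appends a
reorder from that temporary into the real output gradient. The sketch below
shows the same sum-then-reorder pattern with raw mkldnn v0.x primitives; it
is a hypothetical standalone example, not code from this patch, and the
names (srcMD, tmp, pipeline, ...) are illustrative only.

// merge_example.cpp -- sum two nchw gradients into an nChw8c destination.
#include <vector>
#include "mkldnn.hpp"

int main() {
  using namespace mkldnn;
  engine eng(engine::cpu, 0);

  // The two gradient sources share one format; the destination uses another.
  memory::desc srcMD({2, 8, 4, 4}, memory::data_type::f32, memory::format::nchw);
  memory::desc dstMD({2, 8, 4, 4}, memory::data_type::f32, memory::format::nChw8c);
  memory src0({srcMD, eng}), src1({srcMD, eng}), dst({dstMD, eng});

  std::vector<memory::primitive_desc> srcPDs = {src0.get_primitive_desc(),
                                                src1.get_primitive_desc()};
  std::vector<float> scales = {1.0f, 1.0f};
  std::vector<primitive::at> inputs = {src0, src1};
  std::vector<primitive> pipeline;

  // Since sum cannot write across formats, accumulate into a temporary
  // buffer that matches the sources' format...
  memory tmp(srcPDs[0]);
  pipeline.push_back(sum(sum::primitive_desc(srcPDs[0].desc(), scales, srcPDs),
                         inputs, tmp));
  // ...then reorder the merged result into the destination format,
  // mirroring tmpOutGrad_/tmpCvt_ in the patch.
  pipeline.push_back(reorder(tmp, dst));

  stream(stream::kind::eager).submit(pipeline).wait();
  return 0;
}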
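
Note on the resetInGrad change: `auto arg` deduces a value type, so the
Argument returned by input->getOutput(this->getName()) was copied and the
grad assignment only mutated the copy; binding `Argument& arg` makes the
assignment reach the layer's stored output. A minimal illustration of the
pitfall, with hypothetical stand-in types rather than Paddle's:

#include <cassert>
#include <map>
#include <memory>
#include <string>

struct Argument {
  std::shared_ptr<int> grad;  // stands in for Paddle's MatrixPtr grad
};

int main() {
  std::map<std::string, Argument> outputs;
  outputs["next_layer"] = Argument{};

  auto copy = outputs["next_layer"];  // `auto` copies the Argument
  copy.grad = std::make_shared<int>(1);
  assert(outputs["next_layer"].grad == nullptr);  // original untouched

  Argument& ref = outputs["next_layer"];  // binds to the stored Argument
  ref.grad = std::make_shared<int>(2);
  assert(outputs["next_layer"].grad != nullptr);  // assignment sticks
  return 0;
}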
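
The new sample_trainer_config_branch_net.conf exists to exercise these merge
paths: the first conv output feeds both a1 and a2 (joined by concat_layer),
and the pooled concat output feeds both b1 and b2 (joined by addto_layer),
so those layers receive more than one output gradient to merge during
backward. The added test_CompareMKLDNNandCPU_Branches case runs this config
twice, with use_mkldnn=True and use_mkldnn=False, and compares the results;
after building with WITH_MKLDNN it can be run with, for example,
"ctest -R test_CompareMKLDNNandCPU_Branches".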