Merge pull request #5049 from tensor-tang/mkldnn_bn

enable mkldnn_batch_norm

Merge pull request #5049 from tensor-tang/mkldnn_bn
enable mkldnn_batch_norm
b68f2d20 · Tao Luo · GitHub · 97fcaef0 · 5ba1e1e1 · b68f2d20
10 changed files
--- a/paddle/gserver/layers/MKLDNNBatchNormLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNBatchNormLayer.cpp
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "MKLDNNBatchNormLayer.h"
+using namespace mkldnn;  // NOLINT
+typedef memory::format format;
+namespace paddle {
+REGISTER_LAYER(mkldnn_batch_norm, MKLDNNBatchNormLayer);
+const real MKLDNNBatchNormLayer::EPS = 1E-5;
+/**
+ * Initialize the layer: validate the three inputs, read the image geometry,
+ * pick up the statistics settings from the config and create the weights.
+ *
+ * @param layerMap      forwarded to MKLDNNLayer::init.
+ * @param parameterMap  forwarded to MKLDNNLayer::init.
+ * @return false when MKLDNNLayer::init fails, true otherwise.
+ */
+bool MKLDNNBatchNormLayer::init(const LayerMap& layerMap,
+                                const ParameterMap& parameterMap) {
+  const bool baseReady = MKLDNNLayer::init(layerMap, parameterMap);
+  if (!baseReady) {
+    return false;
+  }
+  // Input 0 is the data layer; inputs 1 and 2 are created in
+  // config_parser.py and store the moving mean and the moving variance.
+  CHECK_EQ(inputLayers_.size(), 3U);
+  CHECK_EQ(inputLayers_.size(), parameters_.size());
+  CHECK_EQ(inputLayers_.size(), size_t(config_.inputs_size()));
+  const ImageConfig& imgConf = config_.inputs(0).image_conf();
+  const auto& dataOutput = inputLayers_[0]->getOutput();
+  ic_ = imgConf.channels();
+  ih_ = dataOutput.getFrameHeight();
+  iw_ = dataOutput.getFrameWidth();
+  // Fall back to the configured image size when the input reports no frame
+  // height and no frame width.
+  const bool noFrameSize = (iw_ == 0) && (ih_ == 0);
+  if (noFrameSize) {
+    iw_ = imgConf.img_size();
+    if (imgConf.has_img_size_y()) {
+      ih_ = imgConf.img_size_y();
+    } else {
+      ih_ = imgConf.img_size();
+    }
+  }
+  // The output keeps the channel count and the image size of the input.
+  oc_ = ic_;
+  oh_ = ih_;
+  ow_ = iw_;
+  // Only override the constructor default when the config sets the flag.
+  if (config_.has_use_global_stats()) {
+    useGlobalStats_ = config_.use_global_stats();
+  }
+  movingAvgFraction_ = config_.moving_average_fraction();
+  VLOG(MKLDNN_BASE) << "--- " << (useGlobalStats_ ? "use" : "do not use")
+                    << " --- global stats";
+  VLOG(MKLDNN_BASE) << "Moving average fraction: " << movingAvgFraction_;
+  initWeight();
+  movingMean_.reset(new Weight(oc_, 1, parameters_[1], 0));
+  movingVar_.reset(new Weight(oc_, 1, parameters_[2], 0));
+  return true;
+}
+/**
+ * Create the weight (scale) and bias (shift) objects and pack their value
+ * and gradient buffers into the 2 x oc_ matrices valueScaleShift_ and
+ * gradScaleShift_.
+ *
+ * In each matrix the first oc_ elements back the weight parameter and the
+ * following oc_ elements back the bias parameter; the parameter buffers are
+ * re-pointed at that memory with setData().
+ */
+void MKLDNNBatchNormLayer::initWeight() {
+  weight_.reset(new Weight(1, oc_, parameters_[0]));
+  if (biasParameter_.get() != NULL) {
+    biases_ = std::unique_ptr<Weight>(new Weight(1, oc_, biasParameter_));
+  }
+  // weight_ was just created, so this effectively requires a bias as well.
+  CHECK_EQ(weight_ != nullptr, biases_ != nullptr)
+      << "only support have both weight and bias, or neither";
+  if (weight_ && weight_->getW()) {
+    CHECK(biases_ && biases_->getW());
+    // NOTE(review): the two trailing `false` arguments are presumably
+    // trans/useGpu -- confirm against Matrix::create.
+    valueScaleShift_ = Matrix::create(2, oc_, false, false);
+    valueScaleShift_->zeroMem();
+    // Temporary vector views over the two halves of the packed buffer.
+    VectorPtr scale(new CpuVector(oc_, valueScaleShift_->getMemoryHandle(), 0));
+    VectorPtr shift(
+        new CpuVector(oc_, valueScaleShift_->getMemoryHandle(), oc_));
+    const VectorPtr& wgt = parameters_[0]->getBuf(PARAMETER_VALUE);
+    const VectorPtr& bias = biasParameter_->getBuf(PARAMETER_VALUE);
+    // Copy the current parameter values into the packed buffer first ...
+    scale->copyFrom(*wgt);
+    shift->copyFrom(*bias);
+    // ... then make the parameter value buffers point at the packed buffer.
+    wgt->setData(valueScaleShift_->getData());
+    bias->setData(valueScaleShift_->getData() + oc_);
+  }
+  if (weight_ && weight_->getWGrad()) {
+    CHECK(biases_ && biases_->getWGrad());
+    gradScaleShift_ = Matrix::create(2, oc_, false, false);
+    gradScaleShift_->zeroMem();
+    // Gradients are only re-pointed at the zeroed packed buffer; the
+    // previous gradient contents are not copied over.
+    const VectorPtr& wgt = parameters_[0]->getBuf(PARAMETER_GRADIENT);
+    const VectorPtr& bias = biasParameter_->getBuf(PARAMETER_GRADIENT);
+    wgt->setData(gradScaleShift_->getData());
+    bias->setData(gradScaleShift_->getData() + oc_);
+  }
+}
+/**
+ * One-time weight preparation: when global statistics are used, load the
+ * moving mean and moving variance into mean_ and var_. Later calls are
+ * no-ops because hasInitedWgt_ is set on the first call.
+ */
+void MKLDNNBatchNormLayer::convertWeightsFromPaddle() {
+  if (!hasInitedWgt_) {
+    if (useGlobalStats_) {
+      // mean_ and var_ must already exist before they can be filled
+      CHECK(mean_);
+      CHECK(var_);
+      mean_->copyFrom(*(movingMean_->getW()));
+      var_->copyFrom(*(movingVar_->getW()));
+    }
+    hasInitedWgt_ = true;
+  }
+}
+/**
+ * Fold the statistics of the current mini-batch (mean_, var_) into the
+ * moving mean and moving variance. Must not be called when global
+ * statistics are in use.
+ */
+void MKLDNNBatchNormLayer::calMovingMeanAndVar() {
+  CHECK_EQ(useGlobalStats_, false);
+  const auto& runningMean = movingMean_->getW();
+  const auto& runningVar = movingVar_->getW();
+  // intended update: moving = moving * fraction + local * (1 - fraction)
+  runningMean->add(*mean_, movingAvgFraction_, 1.0 - movingAvgFraction_);
+  // var_ holds v^2 here
+  runningVar->add(*var_, movingAvgFraction_, 1.0 - movingAvgFraction_);
+}
+/**
+ * Recompute the layer's shapes for the current input.
+ *
+ * @param bs  batch size, updated by reshapeInput.
+ * @param ic  input channels; must match the value derived from the input.
+ * @param ih  input height, updated by reshapeInput.
+ * @param iw  input width, updated by reshapeInput.
+ * @param oc  output channels (not modified here).
+ * @param oh  output height, set to the input height.
+ * @param ow  output width, set to the input width.
+ */
+void MKLDNNBatchNormLayer::reshape(
+    int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) {
+  reshapeInput(bs, ih, iw);
+  // The output image size follows the input image size. The width used to
+  // be the self-assignment `ow = ow`, which left ow stale whenever the
+  // input width changed.
+  oh = ih;
+  ow = iw;
+  // ic_ and oc can not be changed
+  CHECK_EQ(inputElemenCnt_ / bs / ih / iw, (size_t)ic)
+      << "Input channel can not be changed";
+  reshapeOutput(oh, ow);
+  resizeOutput(bs, oc * oh * ow);
+  printSizeInfo();
+}
+/**
+ * Rebuild the forward buffers, primitive descriptor and pipeline.
+ *
+ * Outside of PASS_TEST the mean and variance are always computed from the
+ * mini-batch, so useGlobalStats_ is forced to false (with a warning). In
+ * PASS_TEST the configured useGlobalStats_ value is honoured.
+ * The `bias` argument is not used by this layer.
+ */
+void MKLDNNBatchNormLayer::resetFwd(std::vector<primitive>& pipeline,
+                                    MKLDNNMatrixPtr& in,
+                                    MKLDNNMatrixPtr& wgt,
+                                    MKLDNNMatrixPtr& bias,
+                                    MKLDNNMatrixPtr& out) {
+  const bool notTesting = (passType_ != PASS_TEST);
+  if (notTesting && useGlobalStats_) {
+    LOG(WARNING) << "use_global_stats is invalid setting in training phase";
+    useGlobalStats_ = false;
+  }
+  resetFwdBuffers(in, wgt, out);
+  resetFwdPD(fwdPD_, in, wgt, out);
+  resetFwdPipeline(pipeline, fwdPD_, in, wgt, out);
+}
+/**
+ * Rebuild the backward buffers, primitive descriptor and pipeline.
+ * The backward primitive descriptor is local to this call; `bias` is not
+ * used by this layer.
+ */
+void MKLDNNBatchNormLayer::resetBwd(std::vector<primitive>& pipeline,
+                                    MKLDNNMatrixPtr& in,
+                                    MKLDNNMatrixPtr& wgt,
+                                    MKLDNNMatrixPtr& bias,
+                                    MKLDNNMatrixPtr& out) {
+  resetBwdBuffers(in, wgt, out);
+  std::shared_ptr<bn_bwd::primitive_desc> bwdPD;
+  resetBwdPD(bwdPD, in, wgt, out);
+  resetBwdPipeline(pipeline, bwdPD, in, wgt, out);
+}
+/**
+ * Run the base-class forward pass and, unless this is a PASS_TEST pass,
+ * update the moving mean and variance afterwards.
+ */
+void MKLDNNBatchNormLayer::forward(PassType passType) {
+  MKLDNNLayer::forward(passType);
+  if (passType_ == PASS_TEST) {
+    // moving statistics are left untouched in the test pass
+    return;
+  }
+  calMovingMeanAndVar();
+}
+/**
+ * Trigger the incremental update of the weight parameter and, when the
+ * bias exists and has a gradient, of the bias parameter as well.
+ */
+void MKLDNNBatchNormLayer::updateWeights(const UpdateCallback& callback) {
+  weight_->getParameterPtr()->incUpdate(callback);
+  const bool biasHasGrad = biases_ && biases_->getWGrad();
+  if (!biasHasGrad) {
+    return;
+  }
+  biases_->getParameterPtr()->incUpdate(callback);
+}
+/**
+ * Reset the forward buffers: input value, output value (same format as the
+ * input), the packed scale-shift weight when present, and the mean/variance
+ * buffers when they are needed.
+ */
+void MKLDNNBatchNormLayer::resetFwdBuffers(MKLDNNMatrixPtr& in,
+                                           MKLDNNMatrixPtr& wgt,
+                                           MKLDNNMatrixPtr& out) {
+  resetInValue(in);
+  memory::dims outputDims{bs_, oc_, oh_, ow_};
+  CHECK(in);
+  auto outputPD =
+      MKLDNNMatrix::createPrimitiveDesc(outputDims, in->getFormat(), engine_);
+  resetOutValue(out, outputPD);
+  if (valueScaleShift_) {
+    // scale and shift live in one 2 x oc_ buffer
+    auto scaleShiftPD =
+        MKLDNNMatrix::createPrimitiveDesc({2, oc_}, format::nc, engine_);
+    resetWithMatrix(wgt, valueScaleShift_, scaleShiftPD);
+  }
+  // mean_/var_ are needed outside PASS_TEST, or when global stats are used
+  const bool needStats = (passType_ != PASS_TEST) || useGlobalStats_;
+  if (needStats) {
+    auto statsPD = MKLDNNMatrix::createPrimitiveDesc({oc_}, format::x, engine_);
+    mean_ = MKLDNNMatrix::create(statsPD);
+    var_ = MKLDNNMatrix::create(statsPD);
+  }
+}
+/**
+ * Build the forward primitive descriptor and verify that the already
+ * created buffers match the layouts it expects.
+ *
+ * Also recomputes flags_ (use_global_stats / use_scale_shift), which the
+ * backward descriptor reuses.
+ */
+void MKLDNNBatchNormLayer::resetFwdPD(
+    std::shared_ptr<bn_fwd::primitive_desc>& pd,
+    MKLDNNMatrixPtr in,
+    MKLDNNMatrixPtr wgt,
+    MKLDNNMatrixPtr out) {
+  const bool scoring = (passType_ == PASS_TEST);
+  prop_kind propKind = prop_kind::forward_training;
+  if (scoring) {
+    propKind = prop_kind::forward_scoring;
+  }
+  flags_ = 0u;
+  if (useGlobalStats_) {
+    flags_ |= batch_normalization_flag::use_global_stats;
+  }
+  if (wgt) {
+    flags_ |= batch_normalization_flag::use_scale_shift;
+  }
+  auto fwdDesc = bn_fwd::desc(propKind, in->getMemoryDesc(), EPS, flags_);
+  pd.reset(new bn_fwd::primitive_desc(fwdDesc, engine_));
+  // TODO(TJ): use check macro
+  CHECK(out);
+  CHECK(out->getPrimitiveDesc() == pd->dst_primitive_desc());
+  if (wgt) {
+    CHECK(wgt->getPrimitiveDesc() == pd->weights_primitive_desc());
+  }
+  // same condition under which resetFwdBuffers creates mean_ and var_
+  if (!scoring || useGlobalStats_) {
+    CHECK(mean_);
+    CHECK(mean_->getPrimitiveDesc() == pd->mean_primitive_desc());
+    CHECK(var_);
+    CHECK(var_->getPrimitiveDesc() == pd->variance_primitive_desc());
+  }
+}
+/**
+ * Create the forward batch-norm primitive and append it to the pipeline.
+ *
+ * The bn_fwd constructor that is used depends on the pass type, on
+ * useGlobalStats_ and on whether a scale-shift weight (wgt) is present:
+ *  - PASS_TEST with global stats: mean_ and var_ are passed as inputs;
+ *  - PASS_TEST without global stats: no mean/variance arguments are passed;
+ *  - otherwise (training): mean_ and var_ are passed after the output.
+ */
+void MKLDNNBatchNormLayer::resetFwdPipeline(
+    std::vector<primitive>& pipeline,
+    std::shared_ptr<bn_fwd::primitive_desc>& pd,
+    MKLDNNMatrixPtr& in,
+    MKLDNNMatrixPtr& wgt,
+    MKLDNNMatrixPtr& out) {
+  if (passType_ == PASS_TEST) {
+    if (useGlobalStats_) {
+      // NOTE(review): the primitive::at casts presumably select the overload
+      // that treats mean/variance as inputs -- confirm against mkldnn.hpp.
+      fwd_.reset(wgt != nullptr ? new bn_fwd(*pd,
+                                             *in,
+                                             (const primitive::at)(*mean_),
+                                             (const primitive::at)(*var_),
+                                             *wgt,
+                                             *out)
+                                : new bn_fwd(*pd,
+                                             *in,
+                                             (const primitive::at)(*mean_),
+                                             (const primitive::at)(*var_),
+                                             *out));
+    } else {
+      fwd_.reset(wgt != nullptr ? new bn_fwd(*pd, *in, *wgt, *out)
+                                : new bn_fwd(*pd, *in, *out));
+    }
+  } else {
+    // resetFwd() clears useGlobalStats_ before reaching this point when the
+    // pass is not PASS_TEST.
+    CHECK_EQ(useGlobalStats_, false)
+        << "useGlobalStats should be false in training";
+    fwd_.reset(wgt != nullptr ? new bn_fwd(*pd, *in, *wgt, *out, *mean_, *var_)
+                              : new bn_fwd(*pd, *in, *out, *mean_, *var_));
+  }
+  pipeline.push_back(*fwd_);
+}
+/**
+ * Reset the backward buffers: the output gradient and input gradient reuse
+ * the primitive descriptors of the forward output/input values, and the
+ * packed scale-shift gradient (when present) reuses the one of wgtVal_.
+ */
+void MKLDNNBatchNormLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
+                                           MKLDNNMatrixPtr& wgt,
+                                           MKLDNNMatrixPtr& out) {
+  CHECK(inVal_ && outVal_);
+  resetOutGrad(out, outVal_->getPrimitiveDesc());
+  resetInGrad(in, inVal_->getPrimitiveDesc());
+  if (!gradScaleShift_) {
+    // no scale-shift gradient buffer to bind
+    return;
+  }
+  CHECK(wgtVal_);
+  resetWithMatrix(wgt, gradScaleShift_, wgtVal_->getPrimitiveDesc());
+}
+/**
+ * Build the backward primitive descriptor from the forward one (fwdPD_) and
+ * verify that the gradient, weight and statistics buffers match it.
+ *
+ * pd is left as nullptr when `in` (the input gradient set up by
+ * resetBwdBuffers) is null, which makes resetBwdPipeline skip the backward
+ * primitive.
+ */
+void MKLDNNBatchNormLayer::resetBwdPD(
+    std::shared_ptr<bn_bwd::primitive_desc>& pd,
+    MKLDNNMatrixPtr& in,
+    MKLDNNMatrixPtr& wgt,
+    MKLDNNMatrixPtr& out) {
+  pd = nullptr;
+  if (in == nullptr) {
+    return;
+  }
+  CHECK(out);
+  CHECK(out->getPrimitiveDesc() == in->getPrimitiveDesc());
+  // the same memory desc is passed twice to the backward desc
+  auto md = in->getMemoryDesc();
+  // flags_ was computed by resetFwdPD
+  auto bwdDesc = bn_bwd::desc(prop_kind::backward, md, md, EPS, flags_);
+  pd.reset(new bn_bwd::primitive_desc(bwdDesc, engine_, *fwdPD_));
+  // TODO(TJ): use check macro
+  // NOTE(review): the weight gradient is required unconditionally here,
+  // although resetBwdPipeline also handles a null wgt -- confirm intent.
+  CHECK(wgt);
+  CHECK(wgt->getPrimitiveDesc() == pd->diff_weights_primitive_desc());
+  CHECK(pd->weights_primitive_desc() == fwdPD_->weights_primitive_desc());
+  CHECK(mean_);
+  CHECK(mean_->getPrimitiveDesc() == pd->mean_primitive_desc());
+  CHECK(var_);
+  CHECK(var_->getPrimitiveDesc() == pd->variance_primitive_desc());
+}
+/**
+ * Create the backward batch-norm primitive and append it to the pipeline.
+ * Does nothing when no backward primitive descriptor was built (pd is null).
+ */
+void MKLDNNBatchNormLayer::resetBwdPipeline(
+    std::vector<primitive>& pipeline,
+    std::shared_ptr<bn_bwd::primitive_desc>& pd,
+    MKLDNNMatrixPtr& in,
+    MKLDNNMatrixPtr& wgt,
+    MKLDNNMatrixPtr& out) {
+  if (pd == nullptr) {
+    return;
+  }
+  CHECK(inVal_);
+  // With both the weight gradient (wgt) and the forward weight value
+  // (wgtVal_) available, the constructor that also takes the weights and the
+  // weight gradient is used; otherwise the variant without them is used.
+  bwdData_.reset(
+      wgt && wgtVal_
+          ? new bn_bwd(*pd, *inVal_, *mean_, *var_, *out, *wgtVal_, *in, *wgt)
+          : new bn_bwd(*pd, *inVal_, *mean_, *var_, *out, *in));
+  pipeline.push_back(*bwdData_);
+}
+}  // namespace paddle
--- a/paddle/gserver/layers/MKLDNNBatchNormLayer.h
+++ b/paddle/gserver/layers/MKLDNNBatchNormLayer.h
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "MKLDNNLayer.h"
+#include "mkldnn.hpp"
+namespace paddle {
+typedef mkldnn::batch_normalization_forward bn_fwd;
+typedef mkldnn::batch_normalization_backward bn_bwd;
+/**
+ * @brief A subclass of MKLDNNLayer implementing the BatchNorm layer.
+ *
+ * The config file api is mkldnn_batch_norm.
+ *
+ * The layer expects three inputs: the data layer plus two inputs that hold
+ * the moving mean and the moving variance.
+ */
+class MKLDNNBatchNormLayer : public MKLDNNLayer {
+protected:
+  // Forward primitive_desc, kept because the backward primitive_desc is
+  // created from it.
+  std::shared_ptr<bn_fwd::primitive_desc> fwdPD_;
+  // Epsilon value used in the batch normalization formula.
+  static const real EPS;
+  // Weight and bias in paddle.
+  std::unique_ptr<Weight> weight_;
+  std::unique_ptr<Weight> biases_;
+  // MKL-DNN uses one large buffer that stores both scale and shift,
+  // which correspond to weight and bias in paddle.
+  MatrixPtr valueScaleShift_;
+  MatrixPtr gradScaleShift_;
+  // Moving average of mean.
+  std::unique_ptr<Weight> movingMean_;
+  // Moving average of variance.
+  std::unique_ptr<Weight> movingVar_;
+  // If useGlobalStats_ is true, the loaded mean and variance are used;
+  // otherwise mean and variance are calculated in every mini-batch.
+  bool useGlobalStats_;
+  // Flags used in the MKL-DNN primitive desc; set by resetFwdPD.
+  unsigned flags_;
+  // Fraction used to compute the moving mean and variance; set in init().
+  real movingAvgFraction_;
+  // Whether the weights have been initialized (see convertWeightsFromPaddle).
+  bool hasInitedWgt_;
+  // Local mean and variance:
+  // with useGlobalStats_ they are loaded from the moving mean and variance,
+  // without useGlobalStats_ they are calculated from the current mini-batch.
+  MKLDNNMatrixPtr mean_;
+  MKLDNNMatrixPtr var_;
+public:
+  // useGlobalStats_ defaults to true; init() overrides it when the config
+  // sets use_global_stats.
+  explicit MKLDNNBatchNormLayer(const LayerConfig& config)
+      : MKLDNNLayer(config), useGlobalStats_(true), hasInitedWgt_(false) {}
+  ~MKLDNNBatchNormLayer() {}
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override;
+  void reshape(
+      int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) override;
+  void resetFwd(std::vector<mkldnn::primitive>& pipeline,
+                MKLDNNMatrixPtr& in,
+                MKLDNNMatrixPtr& wgt,
+                MKLDNNMatrixPtr& bias,
+                MKLDNNMatrixPtr& out) override;
+  void resetBwd(std::vector<mkldnn::primitive>& pipeline,
+                MKLDNNMatrixPtr& in,
+                MKLDNNMatrixPtr& wgt,
+                MKLDNNMatrixPtr& bias,
+                MKLDNNMatrixPtr& out) override;
+  void updateWeights(const UpdateCallback& callback) override;
+  void convertWeightsFromPaddle() override;
+protected:
+  // Create weight/bias and the packed scale-shift buffers.
+  void initWeight();
+  /**
+   * Calculate the moving mean and variance.
+   * moving = moving * AvgFraction + local * (1 - AvgFraction)
+   */
+  void calMovingMeanAndVar();
+  /**
+   * Forward functions: reset buffers(input, weight, output),
+   *                    reset primitive descriptor,
+   *                    reset pipeline.
+   */
+  void resetFwdBuffers(MKLDNNMatrixPtr& in,
+                       MKLDNNMatrixPtr& wgt,
+                       MKLDNNMatrixPtr& out);
+  void resetFwdPD(std::shared_ptr<bn_fwd::primitive_desc>& pd,
+                  MKLDNNMatrixPtr in,
+                  MKLDNNMatrixPtr wgt,
+                  MKLDNNMatrixPtr out);
+  void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
+                        std::shared_ptr<bn_fwd::primitive_desc>& pd,
+                        MKLDNNMatrixPtr& in,
+                        MKLDNNMatrixPtr& wgt,
+                        MKLDNNMatrixPtr& out);
+  /**
+   * Backward functions: reset buffers(input, weight, output),
+   *                     reset primitive descriptor,
+   *                     reset pipeline.
+   */
+  void resetBwdBuffers(MKLDNNMatrixPtr& in,
+                       MKLDNNMatrixPtr& wgt,
+                       MKLDNNMatrixPtr& out);
+  void resetBwdPD(std::shared_ptr<bn_bwd::primitive_desc>& pd,
+                  MKLDNNMatrixPtr& in,
+                  MKLDNNMatrixPtr& wgt,
+                  MKLDNNMatrixPtr& out);
+  void resetBwdPipeline(std::vector<mkldnn::primitive>& pipeline,
+                        std::shared_ptr<bn_bwd::primitive_desc>& pd,
+                        MKLDNNMatrixPtr& in,
+                        MKLDNNMatrixPtr& wgt,
+                        MKLDNNMatrixPtr& out);
+};
+}  // namespace paddle
--- a/paddle/gserver/tests/MKLDNNTester.cpp
+++ b/paddle/gserver/tests/MKLDNNTester.cpp
@@ -91,10 +91,16 @@ void MKLDNNTester::setInputImgSize() {
 // init randome parameters of ref, and copy to mkldnn
 void MKLDNNTester::randomWgtDatas() {
  EXPECT_EQ(parameters_[DNN].size(), parameters_[REF].size());
+  const bool isBN = refLayer_->getType() == "batch_norm";
  for (size_t i = 0; i < parameters_[REF].size(); ++i) {
    const VectorPtr& dnnValue = parameters_[DNN][i]->getBuf(PARAMETER_VALUE);
    const VectorPtr& refValue = parameters_[REF][i]->getBuf(PARAMETER_VALUE);
    parameters_[REF][i]->randomize();
+    if (isBN && i == 2) {
+      // this param is the moving variance in batch norm, which must be larger than 0
+      real offset = fabs(refValue->getMin()) + 1.0;
+      refValue->add(offset);
+    }
    dnnValue->copyFrom(*refValue);
    VLOG(MKLDNN_TESTS) << "Random weight " << parameters_[DNN][i]->getName();
@@ -132,8 +138,7 @@ void MKLDNNTester::checkForward() {
 void MKLDNNTester::checkBackwardData() {
  VLOG(MKLDNN_TESTS) << "Check Backward Data";
-  // TODO(TJ): uncomment me when batch norm ready
+  const bool isBN = refLayer_->getType() == "batch_norm";
-  // const bool isBN = dnnLayer_->getType() == "mkldnn_batch_norm";
  for (size_t i = 0; i < dataLayers_[DNN].size(); ++i) {
    const MatrixPtr& dnnDiff = dataLayers_[DNN][i]->getOutputGrad();
    const MatrixPtr& refDiff = dataLayers_[REF][i]->getOutputGrad();
@@ -144,11 +149,11 @@ void MKLDNNTester::checkBackwardData() {
    double delta = compareMatrix(dnnDiff, refDiff);
    EXPECT_LE(fabs(delta), eps_);
-    // TODO(TJ): uncomment me when batch norm ready
+    if (isBN) {
-    // if (isBN) {
+      // the other two inputs in batch norm are for moving mean and var
-    //  // the other two inputs in batch norm are for moving mean and var
+      // do not have grad to compare
-    //  break;
+      break;
-    // }
+    }
  }
 }
@@ -308,10 +313,14 @@ double MKLDNNTester::compareVector(const VectorPtr& v1, const VectorPtr& v2) {
 void MKLDNNTester::runOnce() {
  // test forward
  randomBotDatas();
-  dnnLayer_->forward(PASS_TRAIN);
+  dnnLayer_->forward(passType_);
-  refLayer_->forward(PASS_TRAIN);
+  refLayer_->forward(passType_);
  checkForward();
+  if (passType_ == PASS_TEST) {
+    return;
+  }
  // test backward
  // simple updater
  UpdateCallback updateCallback = [](Parameter* para) {
@@ -343,6 +352,7 @@ void MKLDNNTester::run(const TestConfig& dnn,
                       size_t batchSize,
                       size_t inputImgH,
                       size_t inputImgW,
+                       PassType passType,
                       bool printDetails,
                       size_t iter,
                       float epsilon) {
@@ -361,6 +371,7 @@ void MKLDNNTester::run(const TestConfig& dnn,
  ih_ = inputImgH;
  iw_ = inputImgW;
+  passType_ = passType;
  log_ = printDetails;
  iter_ = iter;
  eps_ = epsilon;

--- a/paddle/gserver/tests/MKLDNNTester.h
+++ b/paddle/gserver/tests/MKLDNNTester.h
@@ -62,12 +62,15 @@ protected:
  float eps_;
  /// input image size, default 1
  size_t ih_, iw_;
+  /// passType, PASS_TRAIN, PASS_TEST or PASS_GC (Gradient Check pass)
+  PassType passType_;
 public:
  explicit MKLDNNTester(size_t iter = 3, float epsilon = 1e-4) {
    iter_ = iter;
    eps_ = epsilon;
    log_ = false;
+    passType_ = PASS_TRAIN;
  }
  ~MKLDNNTester() {}
@@ -78,6 +81,7 @@ public:
           size_t batchSize,
           size_t inputImgH = 1,
           size_t inputImgW = 1,
+           PassType passType = PASS_TRAIN,
           bool printDetails = false,
           size_t iter = 3,
           float epsilon = 1e-4);

--- a/paddle/gserver/tests/test_MKLDNN.cpp
+++ b/paddle/gserver/tests/test_MKLDNN.cpp
@@ -212,6 +212,66 @@ TEST(MKLDNNLayer, PoolLayer) {
  testPoolLayer({2, 8, 56, 56, 29, 29, 3, 3, 1, 1, 2, 2});
 }
+// Shape description of one batch-norm test case.
+struct testBatchNormDesc {
+  int bs;  // batch size
+  int ic;  // number of input channels
+  int ih;  // input image height
+  int iw;  // input image width
+};
+// Fill cfg with an mkldnn_batch_norm layer config for the shape in pm:
+// one data input plus two static inputs for the moving mean and variance.
+static void getMKLDNNBatchNormConfig(TestConfig& cfg,
+                                     const testBatchNormDesc& pm) {
+  const size_t channelCnt = size_t(pm.ic);
+  cfg.layerConfig.set_size(pm.ic * pm.ih * pm.iw);
+  cfg.layerConfig.set_type("mkldnn_batch_norm");
+  cfg.biasSize = pm.ic;
+  // data input: layer size is ic * ih * iw, weight size is ic
+  cfg.inputDefs.push_back(
+      {INPUT_DATA, "layer_0", size_t(pm.ic * pm.ih * pm.iw), channelCnt});
+  // moving mean and moving variance are static inputs
+  cfg.inputDefs.push_back({INPUT_DATA, "layer_1_moving_mean", 1, channelCnt});
+  cfg.inputDefs.back().isStatic = true;
+  cfg.inputDefs.push_back({INPUT_DATA, "layer_2_moving_var", 1, channelCnt});
+  cfg.inputDefs.back().isStatic = true;
+  LayerInputConfig* dataInput = cfg.layerConfig.add_inputs();
+  // TODO(TJ): uncomment me when refine and support comparing all zeroes vector
+  // cfg.layerConfig.set_active_type("relu");
+  cfg.layerConfig.add_inputs();
+  cfg.layerConfig.add_inputs();
+  // only the data input carries an image config
+  ImageConfig* imgConf = dataInput->mutable_image_conf();
+  imgConf->set_channels(pm.ic);
+  imgConf->set_img_size_y(pm.ih);
+  imgConf->set_img_size(pm.iw);
+}
+// Compare mkldnn_batch_norm against the reference batch_norm layer in the
+// train pass and in the test pass (with and without global stats).
+void testBatchNormLayer(const testBatchNormDesc& pm) {
+  TestConfig dnnConfig;
+  getMKLDNNBatchNormConfig(dnnConfig, pm);
+  TestConfig refConfig = dnnConfig;
+  refConfig.layerConfig.set_type("batch_norm");
+  // PASS_TRAIN: use_global_stats should always be false, and batchsize != 1
+  VLOG(MKLDNN_TESTS) << "check train phase";
+  dnnConfig.layerConfig.set_use_global_stats(false);
+  refConfig.layerConfig.set_use_global_stats(false);
+  MKLDNNTester trainTester;
+  trainTester.run(dnnConfig, refConfig, pm.bs, pm.ih, pm.iw, PASS_TRAIN);
+  // PASS_TEST: check use_global_stats both false and true, including
+  // batchsize 1
+  VLOG(MKLDNN_TESTS) << "check test phase";
+  for (auto globalStats : {false, true}) {
+    dnnConfig.layerConfig.set_use_global_stats(globalStats);
+    refConfig.layerConfig.set_use_global_stats(globalStats);
+    MKLDNNTester scoreTester;
+    for (auto batchSize : {pm.bs, 1}) {
+      scoreTester.run(
+          dnnConfig, refConfig, batchSize, pm.ih, pm.iw, PASS_TEST);
+    }
+  }
+}
+// Batch-norm test cases; each initializer is {bs, ic, ih, iw}.
+TEST(MKLDNNLayer, BatchNormLayer) {
+  testBatchNormLayer({4, 10, 6, 6});
+  testBatchNormLayer({16, 32, 16, 16});
+}
 struct testActDesc {
  int bs, ic, ih, iw;
 };

--- a/paddle/math/MKLDNNMatrix.h
+++ b/paddle/math/MKLDNNMatrix.h
@@ -91,6 +91,11 @@ public:
      const MKLDNNMatrixPtr& dst,
      bool checkData = true);
+  /**
+   * Copy the content of src by forwarding to the wrapped matrix m_.
+   * No format reordering is performed here (see the TODO below).
+   */
+  void copyFrom(const Matrix& src) {
+    // TODO(TJ): reorder data if this format is not nchw or x
+    m_->copyFrom(src);
+  }
 public:
  /**
   * Reorder this MKLDNNMatrix from other format.

--- a/paddle/trainer/tests/sample_trainer_config_branch_net.conf
+++ b/paddle/trainer/tests/sample_trainer_config_branch_net.conf
@@ -89,6 +89,36 @@ tmp = img_pool_layer(input=tmp,
            padding=1,
            pool_type=MaxPooling())
+tmp = img_conv_layer(input=tmp,
+            filter_size=3,
+            num_filters=32,
+            padding=1,
+            shared_biases=True,
+            act=LinearActivation(),
+            bias_attr=False)
+tmp = batch_norm_layer(input=tmp,
+            use_global_stats=False,
+            act=ReluActivation())
+c1 = img_conv_layer(input=tmp,
+            filter_size=1,
+            num_filters=32,
+            padding=0,
+            shared_biases=True,
+            act=ReluActivation())
+c2 = img_conv_layer(input=tmp,
+            filter_size=3,
+            num_filters=32,
+            padding=1,
+            shared_biases=True,
+            act=ReluActivation())
+tmp = addto_layer(input=[c1, c2],
+            act=ReluActivation(),
+            bias_attr=False)
 tmp = fc_layer(input=tmp, size=64,
            bias_attr=False,
            act=TanhActivation())

--- a/paddle/trainer/tests/sample_trainer_config_simple_net.conf
+++ b/paddle/trainer/tests/sample_trainer_config_simple_net.conf
@@ -38,9 +38,14 @@ tmp = img_pool_layer(input=tmp,
 tmp = img_conv_layer(input=tmp,
            filter_size=3,
-            num_filters=64,
+            num_filters=32,
            padding=1,
            shared_biases=True,
+            act=LinearActivation(),
+            bias_attr=False)
+tmp = batch_norm_layer(input=tmp,
+            use_global_stats=False,
            act=ReluActivation())
 tmp = img_pool_layer(input=tmp,

--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -2420,6 +2420,7 @@ class BatchNormLayer(LayerBase):
        # If not use is_static, even set learning_rate = 0, decay_rate = 0,
        # these paras will change if set average_window in configure.
        use_gpu = bool(int(g_command_config_args.get("use_gpu", 0)))
+        use_mkldnn = bool(int(g_command_config_args.get("use_mkldnn", 0)))
        is_shared = True if not use_gpu else False
        for i in xrange(2):
            inputs.append(
@@ -2433,11 +2434,17 @@ class BatchNormLayer(LayerBase):
        parallel_nn = bool(int(g_command_config_args.get("parallel_nn", 0)))
        cudnn_version = int(g_command_config_args.get("cudnn_version", 0))
-        # Automatically select cudnn_batch_norm for GPU and batch_norm for CPU.
+        # Automatically select cudnn_batch_norm for GPU, batch_norm for CPU
-        # Also based on cudnn version.
+        # and mkldnn_batch_norm for MKLDNN. Also based on cudnn version.
+        if batch_norm_type == "mkldnn_batch_norm":
+            config_assert(use_mkldnn, "mkldnn_batch_norm only support MKLDNN")
        use_cudnn = use_gpu and batch_norm_type != "batch_norm" and \
+                not use_mkldnn and batch_norm_type != "mkldnn_batch_norm" and \
                ((not parallel_nn) or self.config.device > -1)
-        self.layer_type = "cudnn_batch_norm" if use_cudnn else "batch_norm"
+        if use_cudnn:
+            self.layer_type = "cudnn_batch_norm"
+        else:
+            self.layer_type = "mkldnn_batch_norm" if use_mkldnn else "batch_norm"
        super(BatchNormLayer, self).__init__(
            name, self.layer_type, 0, inputs=inputs, **xargs)

--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -3014,16 +3014,19 @@ def batch_norm_layer(input,
    :param input: batch normalization input. Better be linear activation.
                Because there is an activation inside batch_normalization.
    :type input: LayerOutput
-    :param batch_norm_type: We have batch_norm and cudnn_batch_norm. batch_norm
+    :param batch_norm_type: We have batch_norm, mkldnn_batch_norm and cudnn_batch_norm.
-                            supports both CPU and GPU. cudnn_batch_norm requires
+                            batch_norm supports CPU, MKLDNN and GPU. cudnn_batch_norm
-                            cuDNN version greater or equal to v4 (>=v4). But
+                            requires cuDNN version greater or equal to v4 (>=v4).
-                            cudnn_batch_norm is faster and needs less memory
+                            But cudnn_batch_norm is faster and needs less
-                            than batch_norm. By default (None), we will
+                            memory than batch_norm. mkldnn_batch_norm requires
-                            automaticly select cudnn_batch_norm for GPU and
+                            enable use_mkldnn. By default (None), we will
-                            batch_norm for CPU. Otherwise, select batch norm
+                            automatically select cudnn_batch_norm for GPU,
-                            type based on the specified type. If you use cudnn_batch_norm,
+                            mkldnn_batch_norm for MKLDNN and batch_norm for CPU.
+                            Otherwise, select batch norm type based on the
+                            specified type. If you use cudnn_batch_norm,
                            we suggested you use latest version, such as v5.1.
    :type batch_norm_type: None | string, None or "batch_norm" or "cudnn_batch_norm"
+                           or "mkldnn_batch_norm"
    :param act: Activation Type. Better be relu. Because batch
                     normalization will normalize input near zero.
    :type act: BaseActivation
@@ -3063,6 +3066,7 @@ def batch_norm_layer(input,
        else:
            num_channels = input.size
    assert (batch_norm_type is None) or (batch_norm_type == "batch_norm") or \
+           (batch_norm_type == "mkldnn_batch_norm") or \
           (batch_norm_type == "cudnn_batch_norm")
    l = Layer(
        name=name,