add roi operator unittest

36dd770a · wanghaox · 79609288 · 53bd51e3 · 36dd770a · 36dd770a
85 changed files
--- a/benchmark/IntelOptimizedPaddle.md
+++ b/benchmark/IntelOptimizedPaddle.md
@@ -12,11 +12,11 @@ Machine:

 System: CentOS release 6.3 (Final), Docker 1.12.1.

-PaddlePaddle: paddlepaddle/paddle:latest (TODO: will rerun after 0.11.0)
-
- MKL-DNN tag v0.10
- MKLML 2018.0.20170720
+PaddlePaddle: paddlepaddle/paddle:latest (for MKLML and MKL-DNN), paddlepaddle/paddle:latest-openblas (for OpenBLAS)
+- MKL-DNN tag v0.11
+- MKLML 2018.0.1.20171007
 - OpenBLAS v0.2.20
+(TODO: will rerun after 0.11.0)
 	 
 On each machine, we will test and compare the performance of training on single node using MKL-DNN / MKLML / OpenBLAS respectively.

@@ -31,15 +31,26 @@ Input image size - 3 * 224 * 224, Time: images/second

 | BatchSize    | 64    | 128  | 256     |
 |--------------|-------| -----| --------|
-| OpenBLAS     | 7.82  | 8.62  | 10.34  | 
-| MKLML        | 11.02 | 12.86 | 15.33  |
-| MKL-DNN      | 27.69 | 28.8 | 29.27  |
+| OpenBLAS     | 7.80  | 9.00  | 10.80  | 
+| MKLML        | 12.12 | 13.70 | 16.18  |
+| MKL-DNN      | 28.46 | 29.83 | 30.44  |
+
+
+chart on batch size 128
+TBD
+
+ - ResNet-50
+
+| BatchSize    | 64    | 128   | 256    |
+|--------------|-------| ------| -------|
+| OpenBLAS     | 25.22 | 25.68 | 27.12  | 
+| MKLML        | 32.52 | 31.89 | 33.12  |
+| MKL-DNN      | 81.69 | 82.35 | 84.08  |


 chart on batch size 128
 TBD

- - ResNet
 - GoogLeNet

 ### Laptop

--- a/paddle/gserver/activations/ActivationFunction.cpp
+++ b/paddle/gserver/activations/ActivationFunction.cpp
@@ -212,6 +212,37 @@ Error __must_check backward(Argument& act) {
 }
 END_DEFINE_ACTIVATION(sequence_softmax)

+/**
+ * @brief SoftSign Activation.
+ * \f[
+ * f(z) = \frac{z}{1 + |z|}
+ * \f]
+ */
+BEGIN_DEFINE_ACTIVATION(softsign)
+private:
+MatrixPtr denominator_;
+
+Error __must_check forward(Argument& act) {
+  size_t height = act.value->getHeight();
+  size_t width = act.value->getWidth();
+  Matrix::resizeOrCreate(
+      denominator_, height, width, false, useGpu(act.deviceId));
+  denominator_->assign(*act.value);
+  denominator_->abs2();
+  denominator_->add(1.);
+
+  act.value->dotDiv(*act.value, *denominator_);
+  return Error();
+}
+
+Error __must_check backward(Argument& act) {
+  denominator_->square2();
+  denominator_->scalarDiv(*denominator_, 1.);
+  act.grad->dotMul(*act.grad, *denominator_);
+  return Error();
+}
+END_DEFINE_ACTIVATION(softsign)
+
 /**
 * @brief Relu Activation.
 * forward. y = max(0, z)

--- a/paddle/gserver/layers/MKLDNNAddtoLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNAddtoLayer.cpp
@@ -38,12 +38,13 @@ bool MKLDNNAddtoLayer::init(const LayerMap& layerMap,
 }

 void MKLDNNAddtoLayer::reshape(
-    int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) {
+    int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
  CHECK_EQ(layerSize_, getSize()) << "this layer size can not be changed";
  reshapeInput(bs, ih, iw);
  ic = inputLayers_[0]->getSize() / ih / iw;
  CHECK_EQ((size_t)ic * ih * iw, inputLayers_[0]->getSize());
-  CHECK_EQ(inputElemenCnt_, (size_t)bs * ic * ih * iw);
+  CHECK_EQ(inputLayers_[0]->getOutputValue()->getElementCnt(),
+           (size_t)bs * ic * ih * iw);
  for (size_t i = 0; i < inputLayers_.size(); i++) {
    CHECK_EQ(int64_t(bs), inputLayers_[i]->getOutput().getBatchSize());
    CHECK_EQ(layerSize_, inputLayers_[i]->getSize());
@@ -57,47 +58,43 @@ void MKLDNNAddtoLayer::reshape(
 }

 void MKLDNNAddtoLayer::resetFwd(std::vector<primitive>& pipeline,
-                                MKLDNNMatrixPtr& in,
-                                MKLDNNMatrixPtr& wgt,
-                                MKLDNNMatrixPtr& bias,
+                                std::vector<MKLDNNMatrixPtr>& inputs,
                                MKLDNNMatrixPtr& out) {
-  resetFwdBuffers(inVals_, bias, out);
-  in = inVals_[0];
+  resetFwdBuffers(inputs, biasVal_, out);

  std::shared_ptr<sum::primitive_desc> fwdPD;
  std::shared_ptr<sum::primitive_desc> biasPD;
-  resetFwdPD(fwdPD, biasPD, inVals_, bias, out);
+  resetFwdPD(fwdPD, biasPD, inputs, biasVal_, out);

-  resetFwdPipeline(pipeline, fwdPD, biasPD, inVals_, bias, out);
+  resetFwdPipeline(pipeline, fwdPD, biasPD, inputs, biasVal_, out);
 }

 void MKLDNNAddtoLayer::resetBwd(std::vector<primitive>& pipeline,
-                                MKLDNNMatrixPtr& in,
-                                MKLDNNMatrixPtr& wgt,
-                                MKLDNNMatrixPtr& bias,
+                                std::vector<MKLDNNMatrixPtr>& inputs,
                                MKLDNNMatrixPtr& out) {
-  resetBwdBuffers(inGrads_, bias, out);
-  in = inGrads_[0];
+  resetBwdBuffers(inputs, biasGrad_, out);

  // backward only need share output grad to input grad
-  for (size_t i = 0; i < inGrads_.size(); i++) {
-    if (inGrads_[i] != nullptr) {
-      inGrads_[i] = out;
-      inputLayers_[i]->getOutputGrad()->setData(inGrads_[i]->getData());
+  for (size_t i = 0; i < inputs.size(); i++) {
+    if (inputs[i] != nullptr) {
+      inputs[i] = out;
+      inputLayers_[i]->getOutputGrad()->setData(inputs[i]->getData());
    }
  }

  // backward bias
  bwdBias_ = nullptr;
-  if (bias) {
+  if (biasGrad_) {
    std::vector<float> scales(bs_, 1.0);
-    std::vector<memory::primitive_desc> srcPDs(bs_, bias->getPrimitiveDesc());
-    auto biasPD = sum::primitive_desc(bias->getMemoryDesc(), scales, srcPDs);
+    std::vector<memory::primitive_desc> srcPDs(bs_,
+                                               biasGrad_->getPrimitiveDesc());
+    auto biasPD =
+        sum::primitive_desc(biasGrad_->getMemoryDesc(), scales, srcPDs);
    std::vector<primitive::at> srcs;
    for (size_t i = 0; i < grads_.size(); ++i) {
      srcs.push_back(*(grads_[i]));
    }
-    bwdBias_.reset(new sum(biasPD, srcs, *bias));
+    bwdBias_.reset(new sum(biasPD, srcs, *biasGrad_));
    pipeline.push_back(*bwdBias_);
  }
 }
@@ -208,7 +205,7 @@ void MKLDNNAddtoLayer::resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,

  inputs.resize(inputLayers_.size());
  for (size_t i = 0; i < inputs.size(); i++) {
-    resetInGrad(inputs[i], inVal_->getPrimitiveDesc(), i);
+    resetInGrad(inputs[i], inVals_[i]->getPrimitiveDesc(), i);
    CHECK_PRIMITIVE_DESC_EQ(inputs[i], out->getPrimitiveDesc());
  }


--- a/paddle/gserver/layers/MKLDNNAddtoLayer.h
+++ b/paddle/gserver/layers/MKLDNNAddtoLayer.h
@@ -26,9 +26,6 @@ namespace paddle {
 */
 class MKLDNNAddtoLayer : public MKLDNNLayer {
 protected:
-  std::vector<MKLDNNMatrixPtr> inVals_;
-  std::vector<MKLDNNMatrixPtr> inGrads_;
-
  // layer size == ic * ih * iw == oc * oh *ow, and can not be changed
  size_t layerSize_;

@@ -50,52 +47,19 @@ public:
            const ParameterMap& parameterMap) override;

  void reshape(
-      int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) override;
+      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;

  void resetFwd(std::vector<mkldnn::primitive>& pipeline,
-                MKLDNNMatrixPtr& in,
-                MKLDNNMatrixPtr& wgt,
-                MKLDNNMatrixPtr& bias,
+                std::vector<MKLDNNMatrixPtr>& inputs,
                MKLDNNMatrixPtr& out) override;

  void resetBwd(std::vector<mkldnn::primitive>& pipeline,
-                MKLDNNMatrixPtr& in,
-                MKLDNNMatrixPtr& wgt,
-                MKLDNNMatrixPtr& bias,
+                std::vector<MKLDNNMatrixPtr>& inputs,
                MKLDNNMatrixPtr& out) override;

  void updateWeights(const UpdateCallback& callback) override;

-  void printValueFormat() override {
-    for (size_t i = 0; i < inVals_.size(); ++i) {
-      VLOG(MKLDNN_FMTS) << i << " input: " << inVals_[i]->getFormat() << " >>>";
-    }
-    if (outVal_) {
-      VLOG(MKLDNN_FMTS) << outVal_->getFormat() << " >>> ";
-    }
-    if (extOutVal_) {
-      VLOG(MKLDNN_FMTS) << extOutVal_->getFormat();
-    }
-  }
-
-  void printGradFormat() override {
-    if (extOutGrad_) {
-      VLOG(MKLDNN_FMTS) << extOutGrad_->getFormat();
-    }
-    if (outGrad_) {
-      VLOG(MKLDNN_FMTS) << outGrad_->getFormat() << " <<< ";
-    }
-    for (size_t i = 0; i < inGrads_.size(); ++i) {
-      VLOG(MKLDNN_FMTS) << i << " input: " << inGrads_[i]->getFormat() << "<<<";
-    }
-  }
-
 protected:
-  /**
-   * Forward functions: reset buffers(inputs, output, bias),
-   *                    reset primitive descriptor,
-   *                    reset pipeline.
-   */
  void resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
                       MKLDNNMatrixPtr& bias,
                       MKLDNNMatrixPtr& out);
@@ -110,17 +74,10 @@ protected:
                        std::vector<MKLDNNMatrixPtr>& inputs,
                        MKLDNNMatrixPtr& bias,
                        MKLDNNMatrixPtr& out);
-
-  /**
-   * Backward functions: reset buffers(inputs, output, bias)
-   */
  void resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
                       MKLDNNMatrixPtr& bias,
                       MKLDNNMatrixPtr& out);

-  /**
-   * prepare for bias
-   */
  void prepareBias(MKLDNNMatrixPtr& bias,
                   const MatrixPtr& biasMat,
                   const MKLDNNMatrixPtr& out,

--- a/paddle/gserver/layers/MKLDNNBatchNormLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNBatchNormLayer.cpp
@@ -116,21 +116,20 @@ void MKLDNNBatchNormLayer::calMovingMeanAndVar() {
 }

 void MKLDNNBatchNormLayer::reshape(
-    int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) {
+    int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
  reshapeInput(bs, ih, iw);
  oh = ih;
  ow = iw;
  // ic_ and oc can not be changed
-  CHECK_EQ(inputElemenCnt_ / bs / ih / iw, (size_t)ic)
+  CHECK_EQ((size_t)ic,
+           inputLayers_[0]->getOutputValue()->getElementCnt() / bs / ih / iw)
      << "Input channel can not be changed";
  reshapeOutput(oh, ow);
  resizeOutput(bs, oc * oh * ow);
 }

 void MKLDNNBatchNormLayer::resetFwd(std::vector<primitive>& pipeline,
-                                    MKLDNNMatrixPtr& in,
-                                    MKLDNNMatrixPtr& wgt,
-                                    MKLDNNMatrixPtr& bias,
+                                    std::vector<MKLDNNMatrixPtr>& inputs,
                                    MKLDNNMatrixPtr& out) {
  // In training phase, it will always calculate mean and var,
  // so useGlobalStats must be false.
@@ -140,25 +139,23 @@ void MKLDNNBatchNormLayer::resetFwd(std::vector<primitive>& pipeline,
    useGlobalStats_ = false;
  }

-  resetFwdBuffers(in, wgt, out);
+  resetFwdBuffers(inputs[0], wgtVal_, out);

-  resetFwdPD(fwdPD_, in, wgt, out);
+  resetFwdPD(fwdPD_, inputs[0], wgtVal_, out);

-  resetFwdPipeline(pipeline, fwdPD_, in, wgt, out);
+  resetFwdPipeline(pipeline, fwdPD_, inputs[0], wgtVal_, out);
 }

 void MKLDNNBatchNormLayer::resetBwd(std::vector<primitive>& pipeline,
-                                    MKLDNNMatrixPtr& in,
-                                    MKLDNNMatrixPtr& wgt,
-                                    MKLDNNMatrixPtr& bias,
+                                    std::vector<MKLDNNMatrixPtr>& inputs,
                                    MKLDNNMatrixPtr& out) {
  std::shared_ptr<bn_bwd::primitive_desc> pd;

-  resetBwdBuffers(in, wgt, out);
+  resetBwdBuffers(inputs[0], wgtGrad_, out);

-  resetBwdPD(pd, in, wgt, out);
+  resetBwdPD(pd, inputs[0], wgtGrad_, out);

-  resetBwdPipeline(pipeline, pd, in, wgt, out);
+  resetBwdPipeline(pipeline, pd, inputs[0], wgtGrad_, out);
 }

 void MKLDNNBatchNormLayer::forward(PassType passType) {
@@ -260,9 +257,9 @@ void MKLDNNBatchNormLayer::resetFwdPipeline(
 void MKLDNNBatchNormLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
                                           MKLDNNMatrixPtr& wgt,
                                           MKLDNNMatrixPtr& out) {
-  CHECK(inVal_ && outVal_);
+  CHECK(inVals_[0] && outVal_);
  resetOutGrad(out, outVal_->getPrimitiveDesc());
-  resetInGrad(in, inVal_->getPrimitiveDesc());
+  resetInGrad(in, inVals_[0]->getPrimitiveDesc());
  if (gradScaleShift_) {
    CHECK(wgtVal_);
    resetWithMatrix(wgt, gradScaleShift_, wgtVal_->getPrimitiveDesc());
@@ -297,11 +294,12 @@ void MKLDNNBatchNormLayer::resetBwdPipeline(
  if (pd == nullptr) {
    return;
  }
-  CHECK(inVal_);
+  CHECK(inVals_[0]);
  bwdData_.reset(
      wgt && wgtVal_
-          ? new bn_bwd(*pd, *inVal_, *mean_, *var_, *out, *wgtVal_, *in, *wgt)
-          : new bn_bwd(*pd, *inVal_, *mean_, *var_, *out, *in));
+          ? new bn_bwd(
+                *pd, *inVals_[0], *mean_, *var_, *out, *wgtVal_, *in, *wgt)
+          : new bn_bwd(*pd, *inVals_[0], *mean_, *var_, *out, *in));
  pipeline.push_back(*bwdData_);
 }


--- a/paddle/gserver/layers/MKLDNNBatchNormLayer.h
+++ b/paddle/gserver/layers/MKLDNNBatchNormLayer.h
@@ -73,18 +73,14 @@ public:
  void forward(PassType passType) override;

  void reshape(
-      int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) override;
+      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;

  void resetFwd(std::vector<mkldnn::primitive>& pipeline,
-                MKLDNNMatrixPtr& in,
-                MKLDNNMatrixPtr& wgt,
-                MKLDNNMatrixPtr& bias,
+                std::vector<MKLDNNMatrixPtr>& inputs,
                MKLDNNMatrixPtr& out) override;

  void resetBwd(std::vector<mkldnn::primitive>& pipeline,
-                MKLDNNMatrixPtr& in,
-                MKLDNNMatrixPtr& wgt,
-                MKLDNNMatrixPtr& bias,
+                std::vector<MKLDNNMatrixPtr>& inputs,
                MKLDNNMatrixPtr& out) override;

  void updateWeights(const UpdateCallback& callback) override;
@@ -98,11 +94,7 @@ protected:
   * moving = moving * AvgFraction + local * (1 - AvgFraction)
   */
  void calMovingMeanAndVar();
-  /**
-   * Forward functions: reset buffers(input, weight, output),
-   *                    reset primitive descriptor,
-   *                    reset pipeline.
-   */
+
  void resetFwdBuffers(MKLDNNMatrixPtr& in,
                       MKLDNNMatrixPtr& wgt,
                       MKLDNNMatrixPtr& out);
@@ -115,12 +107,6 @@ protected:
                        MKLDNNMatrixPtr& in,
                        MKLDNNMatrixPtr& wgt,
                        MKLDNNMatrixPtr& out);
-
-  /**
-   * Backward functions: reset buffers(input, weight, output),
-   *                     reset primitive descriptor,
-   *                     reset pipeline.
-   */
  void resetBwdBuffers(MKLDNNMatrixPtr& in,
                       MKLDNNMatrixPtr& wgt,
                       MKLDNNMatrixPtr& out);

--- a/paddle/gserver/layers/MKLDNNConcatLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNConcatLayer.cpp
@@ -32,17 +32,16 @@ bool MKLDNNConcatLayer::init(const LayerMap& layerMap,
 }

 void MKLDNNConcatLayer::reshape(
-    int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) {
+    int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
  reshapeInput(bs, ih, iw);
  ic = inputLayers_[0]->getSize() / ih / iw;
  CHECK_EQ((size_t)ic * ih * iw, inputLayers_[0]->getSize());
-  CHECK_EQ(inputElemenCnt_, (size_t)bs * ic * ih * iw);
+  CHECK_EQ(inputLayers_[0]->getOutputValue()->getElementCnt(),
+           (size_t)bs * ic * ih * iw);
  CHECK_GT(inputLayers_.size(), 1UL);
  channels_.resize(inputLayers_.size());
  channels_[0] = ic;
-  // need change the output channel, so use oc_ instead
-  // TODO(TJ): change API, use &oc
-  oc_ = ic;
+  oc = ic;
  for (size_t i = 1; i < inputLayers_.size(); i++) {
    int batchsize, height, witdh;
    reshapeInput(batchsize, height, witdh, i);
@@ -52,37 +51,31 @@ void MKLDNNConcatLayer::reshape(

    channels_[i] = inputLayers_[i]->getSize() / height / witdh;
    CHECK_EQ((size_t)channels_[i] * height * witdh, inputLayers_[i]->getSize());
-    oc_ += channels_[i];
+    oc += channels_[i];
  }
  oh = ih;
  ow = iw;
  reshapeOutput(oh, ow);
-  resizeOutput(bs, oc_ * oh * ow);
+  resizeOutput(bs, oc * oh * ow);
 }

 void MKLDNNConcatLayer::resetFwd(std::vector<primitive>& pipeline,
-                                 MKLDNNMatrixPtr& in,
-                                 MKLDNNMatrixPtr& wgt,
-                                 MKLDNNMatrixPtr& bias,
+                                 std::vector<MKLDNNMatrixPtr>& inputs,
                                 MKLDNNMatrixPtr& out) {
-  resetFwdBuffers(inVals_, out);
-  in = inVals_[0];
+  resetFwdBuffers(inputs, out);

  std::shared_ptr<concat::primitive_desc> fwdPD;
-  resetFwdPD(fwdPD, inVals_, out);
+  resetFwdPD(fwdPD, inputs, out);

-  resetFwdPipeline(pipeline, fwdPD, inVals_, out);
+  resetFwdPipeline(pipeline, fwdPD, inputs, out);
 }

 void MKLDNNConcatLayer::resetBwd(std::vector<primitive>& pipeline,
-                                 MKLDNNMatrixPtr& in,
-                                 MKLDNNMatrixPtr& wgt,
-                                 MKLDNNMatrixPtr& bias,
+                                 std::vector<MKLDNNMatrixPtr>& inputs,
                                 MKLDNNMatrixPtr& out) {
-  resetBwdBuffers(inGrads_, out);
-  in = inGrads_[0];
+  resetBwdBuffers(inputs, out);

-  resetBwdPipeline(pipeline, bwds_, inGrads_, out);
+  resetBwdPipeline(pipeline, bwds_, inputs, out);
 }

 void MKLDNNConcatLayer::resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
@@ -90,10 +83,7 @@ void MKLDNNConcatLayer::resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
  inputs.resize(inputLayers_.size());
  bool has8c = false, has16c = false, hasnc = false;
  for (size_t i = 0; i < inputs.size(); i++) {
-    // resetInValue will use ic_ so temporary change as current input's channel
-    // TODO(TJ): change ic_ as vector then can remove channels_
-    ic_ = channels_[i];
-    resetInValue(inputs[i], nullptr, i);
+    resetInValue(inputs[i], nullptr, i, channels_[i]);
    CHECK(inputs[i]);
    auto dm = inputs[i]->getDims();
    // inputs format can be different, but ndims must equal
@@ -114,8 +104,6 @@ void MKLDNNConcatLayer::resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
      has16c = true;
    }
  }
-  // change back, ic_ always save the input 0 size
-  ic_ = channels_[0];

  format outFmt;
  if (has16c && oc_ % 16 == 0) {
@@ -168,14 +156,9 @@ void MKLDNNConcatLayer::resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
  inputs.resize(inputLayers_.size());
  for (size_t i = 0; i < inputs.size(); i++) {
    CHECK(inVals_[i]);
-    // resetInGrad will use inVal_
-    // TODO(TJ): change move inVals_ to MKLDNNLayer ans remove inVal_
-    inVal_ = inVals_[i];
    resetInGrad(inputs[i], inVals_[i]->getPrimitiveDesc(), i);
    CHECK_PRIMITIVE_DESC_EQ(inputs[i], inVals_[i]->getPrimitiveDesc());
  }
-  // change back, inVal_ always save the input 0
-  inVal_ = inVals_[0];
 }

 void MKLDNNConcatLayer::resetBwdPipeline(

--- a/paddle/gserver/layers/MKLDNNConcatLayer.h
+++ b/paddle/gserver/layers/MKLDNNConcatLayer.h
@@ -26,8 +26,6 @@ namespace paddle {
 */
 class MKLDNNConcatLayer : public MKLDNNLayer {
 protected:
-  std::vector<MKLDNNMatrixPtr> inVals_;
-  std::vector<MKLDNNMatrixPtr> inGrads_;
  std::vector<std::shared_ptr<mkldnn::primitive>> bwds_;
  // input channel numbers
  std::vector<int> channels_;
@@ -47,18 +45,14 @@ public:
            const ParameterMap& parameterMap) override;

  void reshape(
-      int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) override;
+      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;

  void resetFwd(std::vector<mkldnn::primitive>& pipeline,
-                MKLDNNMatrixPtr& in,
-                MKLDNNMatrixPtr& wgt,
-                MKLDNNMatrixPtr& bias,
+                std::vector<MKLDNNMatrixPtr>& inputs,
                MKLDNNMatrixPtr& out) override;

  void resetBwd(std::vector<mkldnn::primitive>& pipeline,
-                MKLDNNMatrixPtr& in,
-                MKLDNNMatrixPtr& wgt,
-                MKLDNNMatrixPtr& bias,
+                std::vector<MKLDNNMatrixPtr>& inputs,
                MKLDNNMatrixPtr& out) override;

  void printSizeInfo() override {
@@ -72,38 +66,16 @@ public:
                       << ", " << ow_;
  }

-  void printValueFormat() override {
-    for (size_t i = 0; i < inVals_.size(); ++i) {
-      VLOG(MKLDNN_FMTS) << "Input " << i << ", " << inputLayers_[i]->getName()
-                        << ": " << inVals_[i]->getFormat() << " >>>";
-    }
-    if (outVal_) {
-      VLOG(MKLDNN_FMTS) << outVal_->getFormat() << " >>> ";
-    }
-    if (extOutVal_) {
-      VLOG(MKLDNN_FMTS) << extOutVal_->getFormat();
-    }
-  }
-
-  void printGradFormat() override {
-    if (extOutGrad_) {
-      VLOG(MKLDNN_FMTS) << extOutGrad_->getFormat();
-    }
-    if (outGrad_) {
-      VLOG(MKLDNN_FMTS) << outGrad_->getFormat() << " <<< ";
-    }
-    for (size_t i = 0; i < inGrads_.size(); ++i) {
-      VLOG(MKLDNN_FMTS) << "Input " << i << ", " << inputLayers_[i]->getName()
-                        << ": " << inGrads_[i]->getFormat() << "<<<";
+  size_t keepCondition() {
+    // reset when the total element size of all inputs changed
+    size_t totalSize = inputLayers_[0]->getOutputValue()->getElementCnt();
+    for (size_t i = 1; i < inputLayers_.size(); ++i) {
+      totalSize += inputLayers_[i]->getOutputValue()->getElementCnt();
    }
+    return totalSize;
  }

 protected:
-  /**
-   * Forward functions: reset buffers(inputs, output, bias),
-   *                    reset primitive descriptor,
-   *                    reset pipeline.
-   */
  void resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
                       MKLDNNMatrixPtr& out);
  void resetFwdPD(std::shared_ptr<mkldnn::concat::primitive_desc>& pd,
@@ -113,11 +85,6 @@ protected:
                        std::shared_ptr<mkldnn::concat::primitive_desc>& pd,
                        std::vector<MKLDNNMatrixPtr>& inputs,
                        MKLDNNMatrixPtr& out);
-
-  /**
-   * Backward functions: reset buffers(inputs, output, bias)
-   *                     reset primitives and pipeline
-   */
  void resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
                       MKLDNNMatrixPtr& out);
  void resetBwdPipeline(std::vector<mkldnn::primitive>& pipeline,

--- a/paddle/gserver/layers/MKLDNNConvLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNConvLayer.cpp
@@ -90,7 +90,7 @@ void MKLDNNConvLayer::convertWeightsToPaddle() {
 }

 void MKLDNNConvLayer::reshape(
-    int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) {
+    int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
  reshapeInput(bs, ih, iw);

  // cal output sizes
@@ -105,21 +105,17 @@ void MKLDNNConvLayer::reshape(
 }

 void MKLDNNConvLayer::resetFwd(std::vector<primitive>& pipeline,
-                               MKLDNNMatrixPtr& in,
-                               MKLDNNMatrixPtr& wgt,
-                               MKLDNNMatrixPtr& bias,
+                               std::vector<MKLDNNMatrixPtr>& inputs,
                               MKLDNNMatrixPtr& out) {
  resetFwdPD(fwdPD_);

-  resetFwdBuffers(fwdPD_, in, wgt, bias, out);
+  resetFwdBuffers(fwdPD_, inputs[0], wgtVal_, biasVal_, out);

-  resetFwdPipeline(pipeline, fwdPD_, in, wgt, bias, out);
+  resetFwdPipeline(pipeline, fwdPD_, inputs[0], wgtVal_, biasVal_, out);
 }

 void MKLDNNConvLayer::resetBwd(std::vector<primitive>& pipeline,
-                               MKLDNNMatrixPtr& in,
-                               MKLDNNMatrixPtr& wgt,
-                               MKLDNNMatrixPtr& bias,
+                               std::vector<MKLDNNMatrixPtr>& inputs,
                               MKLDNNMatrixPtr& out) {
  std::shared_ptr<conv_bwdWgt::primitive_desc> bwdWgtPD;
  std::shared_ptr<conv_bwdData::primitive_desc> bwdDataPD;
@@ -128,9 +124,10 @@ void MKLDNNConvLayer::resetBwd(std::vector<primitive>& pipeline,

  resetBwdDataPD(bwdDataPD);

-  resetBwdBuffers(bwdWgtPD, bwdDataPD, in, wgt, bias, out);
+  resetBwdBuffers(bwdWgtPD, bwdDataPD, inputs[0], wgtGrad_, biasGrad_, out);

-  resetBwdPipeline(pipeline, bwdWgtPD, bwdDataPD, in, wgt, bias, out);
+  resetBwdPipeline(
+      pipeline, bwdWgtPD, bwdDataPD, inputs[0], wgtGrad_, biasGrad_, out);
 }

 void MKLDNNConvLayer::updateWeights(const UpdateCallback& callback) {
@@ -236,14 +233,14 @@ void MKLDNNConvLayer::resetBwdWgtPD(
  loadConvSettings(wgtDims, biasDims, strides, dilations, padL, padR);

  // create backward weight using input, output and weight value memory desc
-  CHECK(inVal_) << "Should have internal input value";
+  CHECK(inVals_[0]) << "Should have internal input value";
  CHECK(outVal_) << "Should have internal output value";
  CHECK(wgtVal_) << "Should have weight value";
  algorithm algo = algorithm::convolution_direct;
  padding_kind padKind = padding_kind::zero;
  auto bwdWgtDesc = biasVal_ != nullptr
                        ? conv_bwdWgt::desc(algo,
-                                            inVal_->getMemoryDesc(),
+                                            inVals_[0]->getMemoryDesc(),
                                            wgtVal_->getMemoryDesc(),
                                            biasVal_->getMemoryDesc(),
                                            outVal_->getMemoryDesc(),
@@ -252,7 +249,7 @@ void MKLDNNConvLayer::resetBwdWgtPD(
                                            padR,
                                            padKind)
                        : conv_bwdWgt::desc(algo,
-                                            inVal_->getMemoryDesc(),
+                                            inVals_[0]->getMemoryDesc(),
                                            wgtVal_->getMemoryDesc(),
                                            outVal_->getMemoryDesc(),
                                            strides,
@@ -260,7 +257,7 @@ void MKLDNNConvLayer::resetBwdWgtPD(
                                            padR,
                                            padKind);
  pd.reset(new conv_bwdWgt::primitive_desc(bwdWgtDesc, engine_, *fwdPD_));
-  CHECK_PRIMITIVE_DESC_EQ(inVal_, pd->src_primitive_desc());
+  CHECK_PRIMITIVE_DESC_EQ(inVals_[0], pd->src_primitive_desc());
  CHECK_PRIMITIVE_DESC_EQ(
      outVal_,
      pd->diff_dst_primitive_desc(),
@@ -280,12 +277,12 @@ void MKLDNNConvLayer::resetBwdDataPD(

  memory::dims wgtDims, biasDims, strides, dilations, padL, padR;
  loadConvSettings(wgtDims, biasDims, strides, dilations, padL, padR);
-  CHECK(inVal_) << "Should have internal input value";
+  CHECK(inVals_[0]) << "Should have internal input value";
  CHECK(outVal_) << "Should have internal output value";
  // create backward data using input and output value memory desc
  // but using weight memory desc with any format
  auto bwdDataDesc = conv_bwdData::desc(algorithm::convolution_direct,
-                                        inVal_->getMemoryDesc(),
+                                        inVals_[0]->getMemoryDesc(),
                                        MKLDNNMatrix::createMemoryDesc(wgtDims),
                                        outVal_->getMemoryDesc(),
                                        strides,
@@ -294,7 +291,7 @@ void MKLDNNConvLayer::resetBwdDataPD(
                                        padding_kind::zero);
  pd.reset(new conv_bwdData::primitive_desc(bwdDataDesc, engine_, *fwdPD_));
  CHECK_PRIMITIVE_DESC_EQ(
-      inVal_,
+      inVals_[0],
      pd->diff_src_primitive_desc(),
      "primitive desc of in value and grad should be equal");
  CHECK_PRIMITIVE_DESC_EQ(
@@ -346,12 +343,12 @@ void MKLDNNConvLayer::resetBwdPipeline(
    MKLDNNMatrixPtr& wgt,
    MKLDNNMatrixPtr& bias,
    MKLDNNMatrixPtr& out) {
-  CHECK(inVal_);
+  CHECK(inVals_[0]);
  // add bwdWgt handle
  if (bias) {
-    bwdWgt_.reset(new conv_bwdWgt(*wgtPD, *inVal_, *out, *wgt, *bias));
+    bwdWgt_.reset(new conv_bwdWgt(*wgtPD, *inVals_[0], *out, *wgt, *bias));
  } else {
-    bwdWgt_.reset(new conv_bwdWgt(*wgtPD, *inVal_, *out, *wgt));
+    bwdWgt_.reset(new conv_bwdWgt(*wgtPD, *inVals_[0], *out, *wgt));
  }
  pipeline.push_back(*bwdWgt_);


--- a/paddle/gserver/layers/MKLDNNConvLayer.h
+++ b/paddle/gserver/layers/MKLDNNConvLayer.h
@@ -69,18 +69,14 @@ public:
            const ParameterMap& parameterMap) override;

  void reshape(
-      int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) override;
+      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;

  void resetFwd(std::vector<mkldnn::primitive>& pipeline,
-                MKLDNNMatrixPtr& in,
-                MKLDNNMatrixPtr& wgt,
-                MKLDNNMatrixPtr& bias,
+                std::vector<MKLDNNMatrixPtr>& inputs,
                MKLDNNMatrixPtr& out) override;

  void resetBwd(std::vector<mkldnn::primitive>& pipeline,
-                MKLDNNMatrixPtr& in,
-                MKLDNNMatrixPtr& wgt,
-                MKLDNNMatrixPtr& bias,
+                std::vector<MKLDNNMatrixPtr>& inputs,
                MKLDNNMatrixPtr& out) override;

  void updateWeights(const UpdateCallback& callback) override;
@@ -107,48 +103,26 @@ protected:
                        mkldnn::memory::dims& padL,
                        mkldnn::memory::dims& padR);

-  /**
-   * reset the forward primitive descriptor.
-   */
  void resetFwdPD(std::shared_ptr<conv_fwd::primitive_desc>& pd);
-  /**
-   * reset the MKLDNNMatrix buffers used in forward.
-   */
  void resetFwdBuffers(std::shared_ptr<conv_fwd::primitive_desc>& pd,
                       MKLDNNMatrixPtr& in,
                       MKLDNNMatrixPtr& wgt,
                       MKLDNNMatrixPtr& bias,
                       MKLDNNMatrixPtr& out);
-  /**
-   * reset the forward pipeline.
-   */
  void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
                        std::shared_ptr<conv_fwd::primitive_desc>& pd,
                        MKLDNNMatrixPtr& in,
                        MKLDNNMatrixPtr& wgt,
                        MKLDNNMatrixPtr& bias,
                        MKLDNNMatrixPtr& out);
-
-  /**
-   * reset the backward weight primitive descriptor.
-   */
  void resetBwdWgtPD(std::shared_ptr<conv_bwdWgt::primitive_desc>& pd);
-  /**
-   * reset the backward data primitive descriptor.
-   */
  void resetBwdDataPD(std::shared_ptr<conv_bwdData::primitive_desc>& pd);
-  /**
-   * reset the MKLDNNMatrix buffers used in backward.
-   */
  void resetBwdBuffers(std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD,
                       std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
                       MKLDNNMatrixPtr& in,
                       MKLDNNMatrixPtr& wgt,
                       MKLDNNMatrixPtr& bias,
                       MKLDNNMatrixPtr& out);
-  /**
-   * reset the backward pipeline.
-   */
  void resetBwdPipeline(std::vector<mkldnn::primitive>& pipeline,
                        std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD,
                        std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,

--- a/paddle/gserver/layers/MKLDNNFcLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNFcLayer.cpp
@@ -74,7 +74,7 @@ void MKLDNNFcLayer::convertWeightsToPaddle() {
 }

 void MKLDNNFcLayer::reshape(
-    int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) {
+    int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
  reshapeInput(bs, ih, iw);

  CHECK_EQ(iLayerSize_, inputLayers_[0]->getSize());
@@ -87,32 +87,29 @@ void MKLDNNFcLayer::reshape(
 }

 void MKLDNNFcLayer::resetFwd(std::vector<primitive>& pipeline,
-                             MKLDNNMatrixPtr& in,
-                             MKLDNNMatrixPtr& wgt,
-                             MKLDNNMatrixPtr& bias,
+                             std::vector<MKLDNNMatrixPtr>& inputs,
                             MKLDNNMatrixPtr& out) {
-  resetFwdBuffers(in, wgt, bias, out);
+  resetFwdBuffers(inputs[0], wgtVal_, biasVal_, out);

-  resetFwdPD(fwdPD_, in, wgt, bias, out);
+  resetFwdPD(fwdPD_, inputs[0], wgtVal_, biasVal_, out);

-  resetFwdPipeline(pipeline, fwdPD_, in, wgt, bias, out);
+  resetFwdPipeline(pipeline, fwdPD_, inputs[0], wgtVal_, biasVal_, out);
 }

 void MKLDNNFcLayer::resetBwd(std::vector<primitive>& pipeline,
-                             MKLDNNMatrixPtr& in,
-                             MKLDNNMatrixPtr& wgt,
-                             MKLDNNMatrixPtr& bias,
+                             std::vector<MKLDNNMatrixPtr>& inputs,
                             MKLDNNMatrixPtr& out) {
  std::shared_ptr<fc_bwdWgt::primitive_desc> bwdWgtPD;
  std::shared_ptr<fc_bwdData::primitive_desc> bwdDataPD;

-  resetBwdBuffers(in, wgt, bias, out);
+  resetBwdBuffers(inputs[0], wgtGrad_, biasGrad_, out);

-  resetBwdWgtPD(bwdWgtPD, wgt, bias, out);
+  resetBwdWgtPD(bwdWgtPD, wgtGrad_, biasGrad_, out);

-  resetBwdDataPD(bwdDataPD, in, out);
+  resetBwdDataPD(bwdDataPD, inputs[0], out);

-  resetBwdPipeline(pipeline, bwdWgtPD, bwdDataPD, in, wgt, bias, out);
+  resetBwdPipeline(
+      pipeline, bwdWgtPD, bwdDataPD, inputs[0], wgtGrad_, biasGrad_, out);
 }

 void MKLDNNFcLayer::updateWeights(const UpdateCallback& callback) {
@@ -193,9 +190,9 @@ void MKLDNNFcLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
                                    MKLDNNMatrixPtr& wgt,
                                    MKLDNNMatrixPtr& bias,
                                    MKLDNNMatrixPtr& out) {
-  CHECK(inVal_ && outVal_);
+  CHECK(inVals_[0] && outVal_);
  resetOutGrad(out, outVal_->getPrimitiveDesc());
-  resetInGrad(in, inVal_->getPrimitiveDesc());
+  resetInGrad(in, inVals_[0]->getPrimitiveDesc());

  CHECK(wgtVal_);
  resetWithMatrix(wgt, weight_->getWGrad(), wgtVal_->getPrimitiveDesc());
@@ -212,14 +209,15 @@ void MKLDNNFcLayer::resetBwdWgtPD(
    MKLDNNMatrixPtr& wgt,
    MKLDNNMatrixPtr& bias,
    MKLDNNMatrixPtr& out) {
-  CHECK(inVal_);
-  fc_bwdWgt::desc bwdWgtDesc = bias ? fc_bwdWgt::desc(inVal_->getMemoryDesc(),
-                                                      wgt->getMemoryDesc(),
-                                                      bias->getMemoryDesc(),
-                                                      out->getMemoryDesc())
-                                    : fc_bwdWgt::desc(inVal_->getMemoryDesc(),
-                                                      wgt->getMemoryDesc(),
-                                                      out->getMemoryDesc());
+  CHECK(inVals_[0]);
+  fc_bwdWgt::desc bwdWgtDesc =
+      bias ? fc_bwdWgt::desc(inVals_[0]->getMemoryDesc(),
+                             wgt->getMemoryDesc(),
+                             bias->getMemoryDesc(),
+                             out->getMemoryDesc())
+           : fc_bwdWgt::desc(inVals_[0]->getMemoryDesc(),
+                             wgt->getMemoryDesc(),
+                             out->getMemoryDesc());
  pd.reset(new fc_bwdWgt::primitive_desc(bwdWgtDesc, engine_, *fwdPD_));
 }

@@ -245,11 +243,11 @@ void MKLDNNFcLayer::resetBwdPipeline(
    MKLDNNMatrixPtr& wgt,
    MKLDNNMatrixPtr& bias,
    MKLDNNMatrixPtr& out) {
-  CHECK(inVal_);
+  CHECK(inVals_[0]);
  if (bias) {
-    bwdWgt_.reset(new fc_bwdWgt(*bwdWgtPD, *inVal_, *out, *wgt, *bias));
+    bwdWgt_.reset(new fc_bwdWgt(*bwdWgtPD, *inVals_[0], *out, *wgt, *bias));
  } else {
-    bwdWgt_.reset(new fc_bwdWgt(*bwdWgtPD, *inVal_, *out, *wgt));
+    bwdWgt_.reset(new fc_bwdWgt(*bwdWgtPD, *inVals_[0], *out, *wgt));
  }
  pipeline.push_back(*bwdWgt_);


--- a/paddle/gserver/layers/MKLDNNFcLayer.h
+++ b/paddle/gserver/layers/MKLDNNFcLayer.h
@@ -52,18 +52,14 @@ public:
            const ParameterMap& parameterMap) override;

  void reshape(
-      int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) override;
+      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;

  void resetFwd(std::vector<mkldnn::primitive>& pipeline,
-                MKLDNNMatrixPtr& in,
-                MKLDNNMatrixPtr& wgt,
-                MKLDNNMatrixPtr& bias,
+                std::vector<MKLDNNMatrixPtr>& inputs,
                MKLDNNMatrixPtr& out) override;

  void resetBwd(std::vector<mkldnn::primitive>& pipeline,
-                MKLDNNMatrixPtr& in,
-                MKLDNNMatrixPtr& wgt,
-                MKLDNNMatrixPtr& bias,
+                std::vector<MKLDNNMatrixPtr>& inputs,
                MKLDNNMatrixPtr& out) override;

  void updateWeights(const UpdateCallback& callback) override;
@@ -73,11 +69,6 @@ public:
  void convertWeightsToPaddle() override;

 protected:
-  /**
-   * Forward functions: reset buffers(input, output, weight and bias),
-   *                    reset primitive descriptor,
-   *                    reset pipeline.
-   */
  void resetFwdBuffers(MKLDNNMatrixPtr& in,
                       MKLDNNMatrixPtr& wgt,
                       MKLDNNMatrixPtr& bias,
@@ -93,13 +84,6 @@ protected:
                        MKLDNNMatrixPtr& wgt,
                        MKLDNNMatrixPtr& bias,
                        MKLDNNMatrixPtr& out);
-
-  /**
-   * Backward functions: reset buffers(input, output, weight and bias),
-   *                     reset primitive descriptor for backward weight,
-   *                     reset primitive descriptor for backward data,
-   *                     reset pipeline.
-   */
  void resetBwdBuffers(MKLDNNMatrixPtr& in,
                       MKLDNNMatrixPtr& wgt,
                       MKLDNNMatrixPtr& bias,

--- a/paddle/gserver/layers/MKLDNNLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNLayer.cpp
@@ -48,31 +48,20 @@ void MKLDNNLayer::forward(PassType passType) {
    REGISTER_TIMER_INFO("mkldnn_FwdTimer", getName().c_str());
    CHECK(!inputLayers_.empty());
    copySeqInfoToOutputs();
-    size_t elemenCnt = inputLayers_[0]->getOutputValue()->getElementCnt();
-    if (inputElemenCnt_ != elemenCnt) {
+    if (condition_ != keepCondition()) {
      VLOG(MKLDNN_BASE) << getName() << " reset mkldnn forward";
-      // reset when input total sizes changed, not only the batchsize
-      inputElemenCnt_ = elemenCnt;
-      pipelineFwd_.clear();
+      condition_ = keepCondition();
      reshape(bs_, ic_, ih_, iw_, oc_, oh_, ow_);
-      // all cpu device output grad or value share output's
+      printSizeInfo();
+      // the output_.value and output_.grad are shared with CPU device
      shareCPUDevice();
-      resetFwd(pipelineFwd_, inVal_, wgtVal_, biasVal_, outVal_);
-      // MKLDNNLayer output value should be MKLDNNMatrix
-      // so external output value is necessary.
-      // Then external input value is not necessary,
-      // since input may be mkldnn internal buffer.
-      CHECK(extOutVal_) << "external output value is necessary";
-      output_.value = std::dynamic_pointer_cast<Matrix>(extOutVal_);
-      CHECK(inVal_ && outVal_) << "internal memories are necessary";
-      if (cvtInVal_) {
-        pipelineFwd_.insert(pipelineFwd_.begin(), *cvtInVal_);
-      }
-      if (cvtOutVal_) {
-        pipelineFwd_.push_back(*cvtOutVal_);
-      }
+      pipelineFwd_.clear();
+      inVals_.resize(inputLayers_.size(), nullptr);
+      extInVals_.resize(inputLayers_.size(), nullptr);
+      cvtInVals_.resize(inputLayers_.size(), nullptr);
+      resetFwd(pipelineFwd_, inVals_, outVal_);
+      prepareValueConversions(pipelineFwd_);
      convertWeightsFromPaddle();
-      printSizeInfo();
      printValueFormat();
      needResetBwd_ = true;
    }
@@ -80,8 +69,8 @@ void MKLDNNLayer::forward(PassType passType) {
    if (inputLayers_[0]->getType() == "data" && inputLayers_.size() == 1) {
      // Update input value data when input layer is "data" type,
      // since the input value data address might be changed.
-      CHECK(extInVal_);
-      extInVal_->setData(getInputValue(0, CPU_DEVICE)->getData());
+      CHECK(extInVals_[0]);
+      extInVals_[0]->setData(getInputValue(0, CPU_DEVICE)->getData());
    }

    if (!outputOnlyMKLDNN_) {
@@ -99,22 +88,13 @@ void MKLDNNLayer::backward(const UpdateCallback& callback) {
  if (needResetBwd_) {
    VLOG(MKLDNN_BASE) << getName() << " reset mkldnn backward";
    pipelineBwd_.clear();
+    inGrads_.resize(inputLayers_.size(), nullptr);
+    extInGrads_.resize(inputLayers_.size(), nullptr);
+    cvtInGrads_.resize(inputLayers_.size(), nullptr);
    pipelineMergeGrad_.clear();
    mergeGrad_ = nullptr;
-    resetBwd(pipelineBwd_, inGrad_, wgtGrad_, biasGrad_, outGrad_);
-    // external output grad is not necessary
-    // since output may be mkldnn internal buffer or merge them directly.
-    CHECK(outGrad_) << "internal output grad is necessary";
-    if (extOutGrad_) {
-      CHECK_EQ(extOutGrad_->getData(), output_.grad->getData())
-          << "the external buffer should share the same data with output_.grad";
-    }
-    if (cvtOutGrad_) {
-      pipelineBwd_.insert(pipelineBwd_.begin(), *cvtOutGrad_);
-    }
-    if (cvtInGrad_) {
-      pipelineBwd_.push_back(*cvtInGrad_);
-    }
+    resetBwd(pipelineBwd_, inGrads_, outGrad_);
+    prepareGradConversions(pipelineBwd_);
    printGradFormat();
    needResetBwd_ = false;
  }
@@ -141,8 +121,8 @@ void MKLDNNLayer::backward(const UpdateCallback& callback) {
 void MKLDNNLayer::reshapeInput(int& batchsize,
                               int& height,
                               int& width,
-                               size_t inputIdx) {
-  const Argument& input = inputLayers_[inputIdx]->getOutput();
+                               size_t idx) {
+  const Argument& input = inputLayers_[idx]->getOutput();
  batchsize = input.getBatchSize();
  int h = input.getFrameHeight();
  int w = input.getFrameWidth();
@@ -176,27 +156,30 @@ void MKLDNNLayer::resetWithMatrix(MKLDNNMatrixPtr& dnn,
 void MKLDNNLayer::resetInValue(
    MKLDNNMatrixPtr& in,
    const std::shared_ptr<memory::primitive_desc>& intPD,
-    size_t inputIdx) {
-  cvtInVal_ = nullptr;
-  extInVal_ = nullptr;
+    size_t idx,
+    int inputChannel) {
+  cvtInVals_[idx] = nullptr;
+  extInVals_[idx] = nullptr;
  in = nullptr;
-  CHECK_GT(bs_ * ic_ * ih_ * iw_, 0);
+  inputChannel = inputChannel == 0 ? ic_ : inputChannel;
+  CHECK_GT(bs_ * inputChannel * ih_ * iw_, 0);
  auto extPD = MKLDNNMatrix::createPrimitiveDesc(
-      {bs_, ic_, ih_, iw_}, format::nchw, engine_);
-  const MatrixPtr& inMat = inputLayers_[inputIdx]->getOutputValue();
-  extInVal_ = std::dynamic_pointer_cast<MKLDNNMatrix>(inMat);
-  CHECK_EQ(inputIsOnlyMKLDNN(), extInVal_ != nullptr);
-  if (extInVal_ == nullptr || extInVal_->getFormat() == format::nc) {
-    extInVal_ = MKLDNNMatrix::create(extPD, inMat);
+      {bs_, inputChannel, ih_, iw_}, format::nchw, engine_);
+  const MatrixPtr& inMat = inputLayers_[idx]->getOutputValue();
+  extInVals_[idx] = std::dynamic_pointer_cast<MKLDNNMatrix>(inMat);
+  CHECK_EQ(inputIsOnlyMKLDNN(), extInVals_[idx] != nullptr);
+  if (extInVals_[idx] == nullptr ||
+      extInVals_[idx]->getFormat() == format::nc) {
+    extInVals_[idx] = MKLDNNMatrix::create(extPD, inMat);
  }
-  in = extInVal_;
+  in = extInVals_[idx];
  if (nullptr == intPD || in->getPrimitiveDesc() == *intPD) {
    return;
  }
  // need create reorder
  in = MKLDNNMatrix::create(*intPD);
-  cvtInVal_ = MKLDNNMatrix::createReorder(extInVal_, in);
-  CHECK(cvtInVal_) << "should not be emptry";
+  cvtInVals_[idx] = MKLDNNMatrix::createReorder(extInVals_[idx], in);
+  CHECK(cvtInVals_[idx]) << "should not be emptry";
 }

 void MKLDNNLayer::resetOutValue(MKLDNNMatrixPtr& out,
@@ -218,11 +201,11 @@ void MKLDNNLayer::resetOutValue(MKLDNNMatrixPtr& out,

 void MKLDNNLayer::resetInGrad(MKLDNNMatrixPtr& in,
                              memory::primitive_desc intPD,
-                              size_t inputIdx) {
-  cvtInGrad_ = nullptr;
-  extInGrad_ = nullptr;
+                              size_t idx) {
+  cvtInGrads_[idx] = nullptr;
+  extInGrads_[idx] = nullptr;
  in = nullptr;
-  LayerPtr& input = inputLayers_[inputIdx];
+  LayerPtr& input = inputLayers_[idx];
  if (input->getOutputGrad() == nullptr) {
    // no need input grad
    return;
@@ -237,23 +220,25 @@ void MKLDNNLayer::resetInGrad(MKLDNNMatrixPtr& in,
  in = MKLDNNMatrix::create(intPD, inMat);
  Argument& arg = input->getOutput(this->getName());
  arg.grad = std::dynamic_pointer_cast<Matrix>(in);
-  CHECK_PRIMITIVE_DESC_EQ(inVal_, intPD);
+  CHECK_PRIMITIVE_DESC_EQ(inVals_[idx], intPD);
  if (inputIsOnlyMKLDNN()) {
    return;
  }

-  extInGrad_ = in;
-  if (isPaddleFormat(extInGrad_->getFormat())) {
+  extInGrads_[idx] = in;
+  if (isPaddleFormat(extInGrads_[idx]->getFormat())) {
    return;
  }
  // need create reorder
-  CHECK(extInVal_ != nullptr && isPaddleFormat(extInVal_->getFormat()))
+  CHECK(extInVals_[idx] != nullptr &&
+        isPaddleFormat(extInVals_[idx]->getFormat()))
      << "should have external input value and the format must be nchw(nc)";
-  extInGrad_ = MKLDNNMatrix::create(extInVal_->getPrimitiveDesc(), inMat);
-  CHECK_PRIMITIVE_DESC_EQ(inVal_, intPD);
+  extInGrads_[idx] =
+      MKLDNNMatrix::create(extInVals_[idx]->getPrimitiveDesc(), inMat);
+  CHECK_PRIMITIVE_DESC_EQ(inVals_[idx], intPD);
  in = MKLDNNMatrix::create(intPD);
-  cvtInGrad_ = MKLDNNMatrix::createReorder(in, extInGrad_);
-  CHECK(cvtInGrad_);
+  cvtInGrads_[idx] = MKLDNNMatrix::createReorder(in, extInGrads_[idx]);
+  CHECK(cvtInGrads_[idx]);
 }

 void MKLDNNLayer::resetOutGrad(MKLDNNMatrixPtr& out,
@@ -309,22 +294,8 @@ void MKLDNNLayer::resetMergeGrad(MKLDNNMatrixPtr& out) {
    srcs.push_back(*src);
  }

-  // TODO(TJ): remove me when mkldnn sum support different formats
-  for (size_t i = 1; i < srcPDs.size(); ++i) {
-    CHECK(srcPDs[0] == srcPDs[i]);
-  }
-  tmpOutGrad_ = out;
-  tmpCvt_ = nullptr;
-  if (out->getPrimitiveDesc() != srcPDs[0]) {
-    tmpOutGrad_ = MKLDNNMatrix::create(srcPDs[0]);
-    tmpCvt_ = MKLDNNMatrix::createReorder(tmpOutGrad_, out);
-    CHECK(tmpCvt_);
-    pipelineMergeGrad_.push_back(*tmpCvt_);
-  }
-
-  auto sumPD =
-      sum::primitive_desc(tmpOutGrad_->getMemoryDesc(), scales, srcPDs);
-  mergeGrad_.reset(new sum(sumPD, srcs, *tmpOutGrad_));
+  auto sumPD = sum::primitive_desc(out->getMemoryDesc(), scales, srcPDs);
+  mergeGrad_.reset(new sum(sumPD, srcs, *out));
  pipelineMergeGrad_.insert(pipelineMergeGrad_.begin(), *mergeGrad_);
 }


--- a/paddle/gserver/layers/MKLDNNLayer.h
+++ b/paddle/gserver/layers/MKLDNNLayer.h
@@ -34,15 +34,16 @@ typedef std::shared_ptr<MKLDNNLayer> MKLDNNLayerPtr;
 */
 class MKLDNNLayer : public Layer {
 protected:
-  // input value element count
-  size_t inputElemenCnt_;
  // batch size
  int bs_;
+  // their sizes are always from the first input layer
  // input image channel, height and width
  int ic_, ih_, iw_;
  // output image channel, height and width
  int oc_, oh_, ow_;

+  // the condition used to decide whether forward needs to be reset
+  size_t condition_;
  // backward also need reset after reset forward handle
  bool needResetBwd_;

@@ -67,18 +68,18 @@ protected:
   * When all layers are mkldnn layers, they could save internal data.
   */
  // below MKLDNNMatrix buffers are all internal buffers
-  MKLDNNMatrixPtr inVal_;
-  MKLDNNMatrixPtr inGrad_;
+  std::vector<MKLDNNMatrixPtr> inVals_;
+  std::vector<MKLDNNMatrixPtr> inGrads_;
  MKLDNNMatrixPtr outVal_;
  MKLDNNMatrixPtr outGrad_;
  // below are external value and grad
-  MKLDNNMatrixPtr extInVal_;
-  MKLDNNMatrixPtr extInGrad_;
+  std::vector<MKLDNNMatrixPtr> extInVals_;
+  std::vector<MKLDNNMatrixPtr> extInGrads_;
  MKLDNNMatrixPtr extOutVal_;
  MKLDNNMatrixPtr extOutGrad_;
  // convert handle between external and internal buffers
-  std::shared_ptr<mkldnn::reorder> cvtInVal_;
-  std::shared_ptr<mkldnn::reorder> cvtInGrad_;
+  std::vector<std::shared_ptr<mkldnn::reorder>> cvtInVals_;
+  std::vector<std::shared_ptr<mkldnn::reorder>> cvtInGrads_;
  std::shared_ptr<mkldnn::reorder> cvtOutVal_;
  std::shared_ptr<mkldnn::reorder> cvtOutGrad_;

@@ -93,23 +94,11 @@ protected:
  std::vector<mkldnn::primitive> pipelineMergeGrad_;
  // tmp input argument to save input grad, only used to merge grad
  Argument tmpInArg_;
-  // since mkldnn sum do not support different formats:
-  // can refer to https://github.com/01org/mkl-dnn/issues/134
-  // so need create reorder manually and save tmp MKLDNNMatrix
-  MKLDNNMatrixPtr tmpOutGrad_;
-  std::shared_ptr<mkldnn::primitive> tmpCvt_;

 public:
  explicit MKLDNNLayer(const LayerConfig& config)
      : Layer(config),
-        inputElemenCnt_(0),
-        bs_(0),
-        ic_(0),
-        ih_(0),
-        iw_(0),
-        oc_(0),
-        oh_(0),
-        ow_(0),
+        condition_(0),
        needResetBwd_(true),
        outputOnlyMKLDNN_(false),
        engine_(mkldnn::engine::cpu, 0),
@@ -125,31 +114,28 @@ public:
  virtual void backward(const UpdateCallback& callback);

  /**
-   * reshape the input image sizes
-   * and reset output image and buffer size
-   * output channel can not be changed
+   * reshape the input and output channels and image sizes
+   * and reset output buffer size
   */
  virtual void reshape(
-      int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) = 0;
+      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) = 0;

  /**
   * reset the mkldnn forward primitve and memories
   * only would be called when input size changes
+   * weight and bias buffers should be covered by child class itself
   */
  virtual void resetFwd(std::vector<mkldnn::primitive>& pipeline,
-                        MKLDNNMatrixPtr& in,
-                        MKLDNNMatrixPtr& wgt,
-                        MKLDNNMatrixPtr& bias,
+                        std::vector<MKLDNNMatrixPtr>& inputs,
                        MKLDNNMatrixPtr& out) = 0;

  /**
   * reset the mkldnn backward primitve and memories
   * only would be called when needed
+   * weight and bias buffers should be covered by child class itself
   */
  virtual void resetBwd(std::vector<mkldnn::primitive>& pipeline,
-                        MKLDNNMatrixPtr& in,
-                        MKLDNNMatrixPtr& wgt,
-                        MKLDNNMatrixPtr& bias,
+                        std::vector<MKLDNNMatrixPtr>& inputs,
                        MKLDNNMatrixPtr& out) = 0;

  /**
@@ -175,13 +161,19 @@ public:
  void addOutputArgument(int deviceId) { Layer::addOutputArgument(deviceId); }

 protected:
+  /**
+   * Some layers may have different conditions for resetting the forward.
+   * The function returns the condition under which forward need not be reset.
+   */
+  inline virtual size_t keepCondition() {
+    // reset when the first input element size changed, not only the batchsize
+    return inputLayers_[0]->getOutputValue()->getElementCnt();
+  }
+
  /**
   * reshape the input image sizes and input batchsize
   */
-  void reshapeInput(int& batchsize,
-                    int& height,
-                    int& width,
-                    size_t inputIdx = 0);
+  void reshapeInput(int& batchsize, int& height, int& width, size_t idx = 0);

  /**
   * reshape output image sizes
@@ -199,11 +191,13 @@ protected:
  /**
   * reset input value from input MKLDNNMatrix and internal primitive desc.
   * reset both internal and external buffer and create reorder if necessary.
+   * input channel may be different in concat.
   */
  void resetInValue(
      MKLDNNMatrixPtr& in,
      const std::shared_ptr<mkldnn::memory::primitive_desc>& intPD = nullptr,
-      size_t inputIdx = 0);
+      size_t idx = 0,
+      int inputChannel = 0);

  /**
   * reset output value from internal primitive desc.
@@ -218,7 +212,7 @@ protected:
   */
  void resetInGrad(MKLDNNMatrixPtr& in,
                   mkldnn::memory::primitive_desc intPD,
-                   size_t inputIdx = 0);
+                   size_t idx = 0);

  /**
   * reset output grad from internal primitive desc.
@@ -296,17 +290,19 @@ protected:
   * print the mkldnn memory format of value
   */
  virtual void printValueFormat() {
-    if (extInVal_) {
-      VLOG(MKLDNN_FMTS) << extInVal_->getFormat() << " >>> ";
-    }
-    if (inVal_) {
-      VLOG(MKLDNN_FMTS) << inVal_->getFormat() << " >>>";
+    for (size_t i = 0; i < inVals_.size(); ++i) {
+      if (!inVals_[i]) {
+        continue;
+      }
+      VLOG(MKLDNN_FMTS) << "Input " << i << ", " << inputLayers_[i]->getName()
+                        << ": " << (extInVals_[i] ? extInVals_[i]->getFormat()
+                                                  : inVals_[i]->getFormat())
+                        << " >>> " << inVals_[i]->getFormat() << " >>>";
    }
    if (outVal_) {
-      VLOG(MKLDNN_FMTS) << outVal_->getFormat() << " >>> ";
-    }
-    if (extOutVal_) {
-      VLOG(MKLDNN_FMTS) << extOutVal_->getFormat();
+      VLOG(MKLDNN_FMTS) << outVal_->getFormat() << " >>> "
+                        << (extOutVal_ ? extOutVal_->getFormat()
+                                       : outVal_->getFormat());
    }
    if (wgtVal_) {
      VLOG(MKLDNN_FMTS) << "Weight value format: " << wgtVal_->getFormat();
@@ -320,17 +316,19 @@ protected:
   * print the mkldnn memory format of grad
   */
  virtual void printGradFormat() {
-    if (extOutGrad_) {
-      VLOG(MKLDNN_FMTS) << extOutGrad_->getFormat();
-    }
    if (outGrad_) {
-      VLOG(MKLDNN_FMTS) << outGrad_->getFormat() << " <<< ";
+      VLOG(MKLDNN_FMTS) << outGrad_->getFormat() << " <<< "
+                        << (extOutGrad_ ? extOutGrad_->getFormat()
+                                        : outGrad_->getFormat());
    }
-    if (inGrad_) {
-      VLOG(MKLDNN_FMTS) << inGrad_->getFormat() << " <<<";
-    }
-    if (extInGrad_) {
-      VLOG(MKLDNN_FMTS) << extInGrad_->getFormat() << " <<< ";
+    for (size_t i = 0; i < inGrads_.size(); ++i) {
+      if (!inGrads_[i]) {
+        continue;
+      }
+      VLOG(MKLDNN_FMTS) << "Input " << i << ", " << inputLayers_[i]->getName()
+                        << ": " << (extInGrads_[i] ? extInGrads_[i]->getFormat()
+                                                   : inGrads_[i]->getFormat())
+                        << " <<< " << inGrads_[i]->getFormat() << " <<<";
    }
    if (wgtGrad_) {
      VLOG(MKLDNN_FMTS) << "Weight grad format: " << wgtGrad_->getFormat();
@@ -437,6 +435,41 @@ private:
      outputOtherDevice_[i].cpuSequenceDims = output_.cpuSequenceDims;
    }
  }
+
+  void prepareValueConversions(std::vector<mkldnn::primitive>& pipeline) {
+    // MKLDNNLayer output value should be MKLDNNMatrix
+    // so external output value is necessary.
+    // Then external input value is not necessary,
+    // since input may be mkldnn internal buffer.
+    CHECK(extOutVal_) << "external output value is necessary";
+    output_.value = std::dynamic_pointer_cast<Matrix>(extOutVal_);
+    CHECK(inVals_[0] && outVal_) << "internal memories are necessary";
+    for (size_t i = 0; i < cvtInVals_.size(); ++i) {
+      if (cvtInVals_[i]) {
+        pipeline.insert(pipeline.begin(), *cvtInVals_[i]);
+      }
+    }
+    if (cvtOutVal_) {
+      pipeline.push_back(*cvtOutVal_);
+    }
+  }
+  void prepareGradConversions(std::vector<mkldnn::primitive>& pipeline) {
+    // external output grad is not necessary
+    // since output may be mkldnn internal buffer or merge them directly.
+    CHECK(outGrad_) << "internal output grad is necessary";
+    if (extOutGrad_) {
+      CHECK_EQ(extOutGrad_->getData(), output_.grad->getData())
+          << "the external buffer should share the same data with output_.grad";
+    }
+    if (cvtOutGrad_) {
+      pipeline.insert(pipeline.begin(), *cvtOutGrad_);
+    }
+    for (size_t i = 0; i < cvtInGrads_.size(); ++i) {
+      if (cvtInGrads_[i]) {
+        pipeline.push_back(*cvtInGrads_[i]);
+      }
+    }
+  }
 };

 }  // namespace paddle
--- a/paddle/gserver/layers/MKLDNNPoolLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNPoolLayer.cpp
@@ -58,10 +58,11 @@ bool MKLDNNPoolLayer::init(const LayerMap& layerMap,
 }

 void MKLDNNPoolLayer::reshape(
-    int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) {
+    int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
  reshapeInput(bs, ih, iw);
  // ic_ and oc can not be changed
-  CHECK_EQ(inputElemenCnt_ / bs / ih / iw, (size_t)ic)
+  CHECK_EQ((size_t)ic,
+           inputLayers_[0]->getOutputValue()->getElementCnt() / bs / ih / iw)
      << "Input channel can not be changed";

  // cal output sizes
@@ -74,29 +75,25 @@ void MKLDNNPoolLayer::reshape(
 }

 void MKLDNNPoolLayer::resetFwd(std::vector<primitive>& pipeline,
-                               MKLDNNMatrixPtr& in,
-                               MKLDNNMatrixPtr& wgt,
-                               MKLDNNMatrixPtr& bias,
+                               std::vector<MKLDNNMatrixPtr>& inputs,
                               MKLDNNMatrixPtr& out) {
-  resetFwdBuffers(in, out);
+  resetFwdBuffers(inputs[0], out);

-  resetFwdPD(fwdPD_, in, out);
+  resetFwdPD(fwdPD_, inputs[0], out);

-  resetFwdPipeline(pipeline, fwdPD_, in, out);
+  resetFwdPipeline(pipeline, fwdPD_, inputs[0], out);
 }

 void MKLDNNPoolLayer::resetBwd(std::vector<primitive>& pipeline,
-                               MKLDNNMatrixPtr& in,
-                               MKLDNNMatrixPtr& wgt,
-                               MKLDNNMatrixPtr& bias,
+                               std::vector<MKLDNNMatrixPtr>& inputs,
                               MKLDNNMatrixPtr& out) {
  std::shared_ptr<pool_bwd::primitive_desc> pd;

-  resetBwdBuffers(in, out);
+  resetBwdBuffers(inputs[0], out);

-  resetBwdPD(pd, in, out);
+  resetBwdPD(pd, inputs[0], out);

-  resetBwdPipeline(pipeline, pd, in, out);
+  resetBwdPipeline(pipeline, pd, inputs[0], out);
 }

 void MKLDNNPoolLayer::resetFwdBuffers(MKLDNNMatrixPtr& in,
@@ -151,9 +148,9 @@ void MKLDNNPoolLayer::resetFwdPipeline(

 void MKLDNNPoolLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
                                      MKLDNNMatrixPtr& out) {
-  CHECK(inVal_ && outVal_);
+  CHECK(inVals_[0] && outVal_);
  resetOutGrad(out, outVal_->getPrimitiveDesc());
-  resetInGrad(in, inVal_->getPrimitiveDesc());
+  resetInGrad(in, inVals_[0]->getPrimitiveDesc());
 }

 void MKLDNNPoolLayer::resetBwdPD(std::shared_ptr<pool_bwd::primitive_desc>& pd,

--- a/paddle/gserver/layers/MKLDNNPoolLayer.h
+++ b/paddle/gserver/layers/MKLDNNPoolLayer.h
@@ -53,18 +53,14 @@ public:
            const ParameterMap& parameterMap) override;

  void reshape(
-      int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) override;
+      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;

  void resetFwd(std::vector<mkldnn::primitive>& pipeline,
-                MKLDNNMatrixPtr& in,
-                MKLDNNMatrixPtr& wgt,
-                MKLDNNMatrixPtr& bias,
+                std::vector<MKLDNNMatrixPtr>& inputs,
                MKLDNNMatrixPtr& out) override;

  void resetBwd(std::vector<mkldnn::primitive>& pipeline,
-                MKLDNNMatrixPtr& in,
-                MKLDNNMatrixPtr& wgt,
-                MKLDNNMatrixPtr& bias,
+                std::vector<MKLDNNMatrixPtr>& inputs,
                MKLDNNMatrixPtr& out) override;

  void printSizeInfo() override {
@@ -75,11 +71,6 @@ public:
  }

 protected:
-  /**
-   * Forward functions: reset buffers(input, output),
-   *                    reset primitive descriptor,
-   *                    reset pipeline.
-   */
  void resetFwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out);
  void resetFwdPD(std::shared_ptr<pool_fwd::primitive_desc>& pd,
                  MKLDNNMatrixPtr in,
@@ -88,12 +79,6 @@ protected:
                        std::shared_ptr<pool_fwd::primitive_desc>& pd,
                        MKLDNNMatrixPtr& in,
                        MKLDNNMatrixPtr& out);
-
-  /**
-   * Backward functions: reset buffers(input, output),
-   *                     reset primitive descriptor,
-   *                     reset pipeline.
-   */
  void resetBwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out);
  void resetBwdPD(std::shared_ptr<pool_bwd::primitive_desc>& pd,
                  MKLDNNMatrixPtr& in,

--- a/paddle/gserver/tests/test_MKLDNN.cpp
+++ b/paddle/gserver/tests/test_MKLDNN.cpp
@@ -315,7 +315,7 @@ TEST(MKLDNNLayer, AddtoLayer) {

 static void getMKLDNNConcatConfig(TestConfig& cfg,
                                  const std::vector<testImageDesc>& inputs) {
-  CHECK_GE(inputs.size(), 2) << "at least two inputs";
+  CHECK_GE(inputs.size(), 2UL) << "at least two inputs";
  int oc = inputs[0].ic;
  for (size_t i = 1; i < inputs.size(); ++i) {
    CHECK_EQ(inputs[i].bs, inputs[0].bs);

--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -184,6 +184,7 @@ set(DEPS_OPS
    sequence_softmax_op
    sum_op
    pool_op
+    maxout_op
    pool_with_index_op
    conv_op
    conv_transpose_op
@@ -210,6 +211,7 @@ op_library(sgd_op DEPS selected_rows_functor)
 op_library(adagrad_op DEPS selected_rows_functor)
 op_library(conv_op DEPS vol2col)
 op_library(pool_op DEPS pooling)
+op_library(maxout_op DEPS maxouting)
 op_library(pool_with_index_op DEPS pooling)
 op_library(lod_rank_table_op SRCS lod_rank_table_op.cc DEPS lod_rank_table)
 op_library(lod_tensor_to_array_op SRCS lod_tensor_to_array_op.cc DEPS lod_rank_table_op)

--- a/paddle/operators/activation_op.cc
+++ b/paddle/operators/activation_op.cc
@@ -98,7 +98,6 @@ $y = \max(x, 0)$
  }
 };

-template <typename AttrType>
 class LeakyReluOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  LeakyReluOpMaker(framework::OpProto *proto,
@@ -106,8 +105,7 @@ class LeakyReluOpMaker : public framework::OpProtoAndCheckerMaker {
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of LeakyRelu operator");
    AddOutput("Y", "Output of LeakyRelu operator");
-    AddAttr<AttrType>("alpha", "The small negative slope")
-        .SetDefault(static_cast<AttrType>(0.02f));
+    AddAttr<float>("alpha", "The small negative slope").SetDefault(0.02f);
    AddComment(R"DOC(
 LeakyRelu Activation Operator.

@@ -117,7 +115,6 @@ $y = \max(x, \alpha * x)$
  }
 };

-template <typename AttrType>
 class SoftShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  SoftShrinkOpMaker(framework::OpProto *proto,
@@ -125,8 +122,7 @@ class SoftShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of Softshrink operator");
    AddOutput("Y", "Output of Softshrink operator");
-    AddAttr<AttrType>("lambda", "non-negative offset")
-        .SetDefault(static_cast<AttrType>(0.5f));
+    AddAttr<float>("lambda", "non-negative offset").SetDefault(0.5f);
    AddComment(R"DOC(
 Softshrink Activation Operator.

@@ -173,7 +169,6 @@ $$y = x - \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$
  }
 };

-template <typename AttrType>
 class HardShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  HardShrinkOpMaker(framework::OpProto *proto,
@@ -181,8 +176,8 @@ class HardShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of HardShrink operator");
    AddOutput("Y", "Output of HardShrink operator");
-    AddAttr<AttrType>("threshold", "The value of threshold for HardShrink")
-        .SetDefault(static_cast<AttrType>(0.5));
+    AddAttr<float>("threshold", "The value of threshold for HardShrink")
+        .SetDefault(0.5f);
    AddComment(R"DOC(
 HardShrink Activation Operator.

@@ -308,17 +303,16 @@ $$y = \frac{x}{1 + |x|}$$
  }
 };

-template <typename AttrType>
 class BReluOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  BReluOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of BRelu operator");
    AddOutput("Y", "Output of BRelu operator");
-    AddAttr<AttrType>("t_min", "The min marginal value of BRelu")
-        .SetDefault(static_cast<AttrType>(0));
-    AddAttr<AttrType>("t_max", "The max marginal value of BRelu")
-        .SetDefault(static_cast<AttrType>(24));
+    AddAttr<float>("t_min", "The min marginal value of BRelu")
+        .SetDefault(static_cast<float>(0));
+    AddAttr<float>("t_max", "The max marginal value of BRelu")
+        .SetDefault(static_cast<float>(24));
    AddComment(R"DOC(
 BRelu Activation Operator.

@@ -328,7 +322,6 @@ $y = \max(\min(x, t_{min}), t_{max})$
  }
 };

-template <typename AttrType>
 class SoftReluOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  SoftReluOpMaker(framework::OpProto *proto,
@@ -336,8 +329,8 @@ class SoftReluOpMaker : public framework::OpProtoAndCheckerMaker {
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of SoftRelu operator");
    AddOutput("Y", "Output of SoftRelu operator");
-    AddAttr<AttrType>("threshold", "The threshold value of SoftRelu")
-        .SetDefault(static_cast<AttrType>(40));
+    AddAttr<float>("threshold", "The threshold value of SoftRelu")
+        .SetDefault(40.0f);
    AddComment(R"DOC(
 SoftRelu Activation Operator.

@@ -347,15 +340,13 @@ $y = \ln(1 + \exp(\max(\min(x, threshold), threshold))$
  }
 };

-template <typename AttrType>
 class ELUOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  ELUOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of ELU operator");
    AddOutput("Y", "Output of ELU operator");
-    AddAttr<AttrType>("alpha", "The alpha value of ELU")
-        .SetDefault(static_cast<AttrType>(1.0f));
+    AddAttr<float>("alpha", "The alpha value of ELU").SetDefault(1.0f);
    AddComment(R"DOC(
 ELU Activation Operator.

@@ -368,15 +359,14 @@ $y = \max(0, x) + \min(0, \alpha * (e^x - 1))$
  }
 };

-template <typename AttrType>
 class Relu6OpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  Relu6OpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of Relu6 operator");
    AddOutput("Y", "Output of Relu6 operator");
-    AddAttr<AttrType>("threshold", "The threshold value of Relu6")
-        .SetDefault(static_cast<AttrType>(6));
+    AddAttr<float>("threshold", "The threshold value of Relu6")
+        .SetDefault(6.0f);
    AddComment(R"DOC(
 Relu6 Activation Operator.

@@ -386,15 +376,13 @@ $y = \min(\max(0, x), 6)$
  }
 };

-template <typename AttrType>
 class PowOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  PowOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of Pow operator");
    AddOutput("Y", "Output of Pow operator");
-    AddAttr<AttrType>("factor", "The exponential factor of Pow")
-        .SetDefault(static_cast<AttrType>(1));
+    AddAttr<float>("factor", "The exponential factor of Pow").SetDefault(1.0f);
    AddComment(R"DOC(
 Pow Activation Operator.

@@ -404,17 +392,16 @@ $y = x^{factor}$
  }
 };

-template <typename AttrType>
 class STanhOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  STanhOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of STanh operator");
    AddOutput("Y", "Output of STanh operator");
-    AddAttr<AttrType>("scale_a", "The scale parameter of a for the input")
-        .SetDefault(static_cast<AttrType>(2 / 3));
-    AddAttr<AttrType>("scale_b", "The scale parameter of b for the input")
-        .SetDefault(static_cast<AttrType>(1.7159));
+    AddAttr<float>("scale_a", "The scale parameter of a for the input")
+        .SetDefault(2.0f / 3.0f);
+    AddAttr<float>("scale_b", "The scale parameter of b for the input")
+        .SetDefault(1.7159f);
    AddComment(R"DOC(
 STanh Activation Operator.

@@ -424,7 +411,6 @@ $$y = b * \frac{e^{a * x} - e^{-a * x}}{e^{a * x} + e^{-a * x}}$$
  }
 };

-template <typename AttrType>
 class ThresholdedReluOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  ThresholdedReluOpMaker(framework::OpProto *proto,
@@ -432,8 +418,8 @@ class ThresholdedReluOpMaker : public framework::OpProtoAndCheckerMaker {
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of ThresholdedRelu operator");
    AddOutput("Y", "Output of ThresholdedRelu operator");
-    AddAttr<AttrType>("threshold", "The threshold location of activation")
-        .SetDefault(static_cast<AttrType>(1.0));
+    AddAttr<float>("threshold", "The threshold location of activation")
+        .SetDefault(1.0f);
    AddComment(R"DOC(
 ThresholdedRelu Activation Operator.

@@ -448,7 +434,6 @@ $$
  }
 };

-template <typename AttrType>
 class HardSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  HardSigmoidOpMaker(framework::OpProto *proto,
@@ -456,10 +441,10 @@ class HardSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of HardSigmoid operator");
    AddOutput("Y", "Output of HardSigmoid operator");
-    AddAttr<AttrType>("slope", "Slope for linear approximation of sigmoid")
-        .SetDefault(static_cast<AttrType>(0.2));
-    AddAttr<AttrType>("offset", "Offset for linear approximation of sigmoid")
-        .SetDefault(static_cast<AttrType>(0.5));
+    AddAttr<float>("slope", "Slope for linear approximation of sigmoid")
+        .SetDefault(0.2f);
+    AddAttr<float>("offset", "Offset for linear approximation of sigmoid")
+        .SetDefault(0.5f);
    AddComment(R"DOC(
 HardSigmoid Activation Operator.

@@ -499,7 +484,7 @@ REGISTER_OP(tanh, ops::ActivationOp, ops::TanhOpMaker, tanh_grad,
 REGISTER_OP(tanh_shrink, ops::ActivationOp, ops::TanhShrinkOpMaker,
            tanh_shrink_grad, ops::ActivationOpGrad);

-REGISTER_OP(softshrink, ops::ActivationOp, ops::SoftShrinkOpMaker<float>,
+REGISTER_OP(softshrink, ops::ActivationOp, ops::SoftShrinkOpMaker,
            softshrink_grad, ops::ActivationOpGrad);

 REGISTER_OP(sqrt, ops::ActivationOp, ops::SqrtOpMaker, sqrt_grad,
@@ -523,35 +508,34 @@ REGISTER_OP(softplus, ops::ActivationOp, ops::SoftplusOpMaker, softplus_grad,
 REGISTER_OP(softsign, ops::ActivationOp, ops::SoftsignOpMaker, softsign_grad,
            ops::ActivationOpGrad);

-REGISTER_OP(brelu, ops::ActivationOp, ops::BReluOpMaker<float>, brelu_grad,
+REGISTER_OP(brelu, ops::ActivationOp, ops::BReluOpMaker, brelu_grad,
            ops::ActivationOpGrad);

-REGISTER_OP(leaky_relu, ops::ActivationOp, ops::LeakyReluOpMaker<float>,
+REGISTER_OP(leaky_relu, ops::ActivationOp, ops::LeakyReluOpMaker,
            leaky_relu_grad, ops::ActivationOpGrad);

-REGISTER_OP(soft_relu, ops::ActivationOp, ops::SoftReluOpMaker<float>,
-            soft_relu_grad, ops::ActivationOpGrad);
+REGISTER_OP(soft_relu, ops::ActivationOp, ops::SoftReluOpMaker, soft_relu_grad,
+            ops::ActivationOpGrad);

-REGISTER_OP(elu, ops::ActivationOp, ops::ELUOpMaker<float>, elu_grad,
+REGISTER_OP(elu, ops::ActivationOp, ops::ELUOpMaker, elu_grad,
            ops::ActivationOpGrad);

-REGISTER_OP(relu6, ops::ActivationOp, ops::Relu6OpMaker<float>, relu6_grad,
+REGISTER_OP(relu6, ops::ActivationOp, ops::Relu6OpMaker, relu6_grad,
            ops::ActivationOpGrad);

-REGISTER_OP(pow, ops::ActivationOp, ops::PowOpMaker<float>, pow_grad,
+REGISTER_OP(pow, ops::ActivationOp, ops::PowOpMaker, pow_grad,
            ops::ActivationOpGrad);

-REGISTER_OP(stanh, ops::ActivationOp, ops::STanhOpMaker<float>, stanh_grad,
+REGISTER_OP(stanh, ops::ActivationOp, ops::STanhOpMaker, stanh_grad,
            ops::ActivationOpGrad);

-REGISTER_OP(hard_shrink, ops::ActivationOp, ops::HardShrinkOpMaker<float>,
+REGISTER_OP(hard_shrink, ops::ActivationOp, ops::HardShrinkOpMaker,
            hard_shrink_grad, ops::ActivationOpGrad);

-REGISTER_OP(thresholded_relu, ops::ActivationOp,
-            ops::ThresholdedReluOpMaker<float>, thresholded_relu_grad,
-            ops::ActivationOpGrad);
+REGISTER_OP(thresholded_relu, ops::ActivationOp, ops::ThresholdedReluOpMaker,
+            thresholded_relu_grad, ops::ActivationOpGrad);

-REGISTER_OP(hard_sigmoid, ops::ActivationOp, ops::HardSigmoidOpMaker<float>,
+REGISTER_OP(hard_sigmoid, ops::ActivationOp, ops::HardSigmoidOpMaker,
            hard_sigmoid_grad, ops::ActivationOpGrad);

 #define REGISTER_ACTIVATION_CPU_KERNEL(act_type, functor, grad_functor)       \

--- a/paddle/operators/adadelta_op.cc
+++ b/paddle/operators/adadelta_op.cc
@@ -109,4 +109,5 @@ paramOut = param + paramUpdate$$
 namespace ops = paddle::operators;
 REGISTER_OP_WITHOUT_GRADIENT(adadelta, ops::AdadeltaOp, ops::AdadeltaOpMaker);
 REGISTER_OP_CPU_KERNEL(
-    adadelta, ops::AdadeltaOpKernel<paddle::platform::CPUPlace, float>);
+    adadelta, ops::AdadeltaOpKernel<paddle::platform::CPUPlace, float>,
+    ops::AdadeltaOpKernel<paddle::platform::CPUPlace, double>);
--- a/paddle/operators/adadelta_op.cu
+++ b/paddle/operators/adadelta_op.cu
@@ -17,4 +17,5 @@

 namespace ops = paddle::operators;
 REGISTER_OP_GPU_KERNEL(
-    adadelta, ops::AdadeltaOpKernel<paddle::platform::GPUPlace, float>);
+    adadelta, ops::AdadeltaOpKernel<paddle::platform::GPUPlace, float>,
+    ops::AdadeltaOpKernel<paddle::platform::GPUPlace, double>);
--- a/paddle/operators/adadelta_op.h
+++ b/paddle/operators/adadelta_op.h
@@ -33,8 +33,8 @@ class AdadeltaOpKernel : public framework::OpKernel<T> {
    avg_squared_grad_out_tensor->mutable_data<T>(ctx.GetPlace());
    avg_squared_update_out_tensor->mutable_data<T>(ctx.GetPlace());

-    float rho = ctx.Attr<float>("rho");
-    float epsilon = ctx.Attr<float>("epsilon");
+    T rho = static_cast<T>(ctx.Attr<float>("rho"));
+    T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));

    auto param = framework::EigenVector<T>::Flatten(
        *ctx.Input<framework::Tensor>("Param"));

--- a/paddle/operators/adagrad_op.cu
+++ b/paddle/operators/adagrad_op.cu
@@ -14,8 +14,8 @@

 #define EIGEN_USE_GPU
 #include "paddle/operators/adagrad_op.h"
-#include "paddle/operators/math/selected_rows_functor.h"
 #include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/selected_rows_functor.h"
 #include "paddle/platform/cuda_helper.h"

 namespace paddle {
@@ -134,8 +134,8 @@ struct SparseAdagradFunctor<platform::GPUPlace, T> {
        T, 256><<<grid2, threads, 0,
                  reinterpret_cast<const platform::CUDADeviceContext&>(context)
                      .stream()>>>(grad_merge_data, grad_merge->rows().data(),
-                                   lr, param_data,
-                                   moment_data, grad_width, epsilon);
+                                   lr, param_data, moment_data, grad_width,
+                                   epsilon);
  }
 };


--- a/paddle/operators/adam_op.cc
+++ b/paddle/operators/adam_op.cc
@@ -127,4 +127,5 @@ paramOut = param - learningRate * moment_1/ ($\sqrt{(moment_2)} + \epsilon)$$
 namespace ops = paddle::operators;
 REGISTER_OP_WITHOUT_GRADIENT(adam, ops::AdamOp, ops::AdamOpMaker);
 REGISTER_OP_CPU_KERNEL(adam,
-                       ops::AdamOpKernel<paddle::platform::CPUPlace, float>);
+                       ops::AdamOpKernel<paddle::platform::CPUPlace, float>,
+                       ops::AdamOpKernel<paddle::platform::CPUPlace, double>);
--- a/paddle/operators/adam_op.cu
+++ b/paddle/operators/adam_op.cu
@@ -17,4 +17,5 @@

 namespace ops = paddle::operators;
 REGISTER_OP_GPU_KERNEL(adam,
-                       ops::AdamOpKernel<paddle::platform::GPUPlace, float>);
+                       ops::AdamOpKernel<paddle::platform::GPUPlace, float>,
+                       ops::AdamOpKernel<paddle::platform::GPUPlace, double>);
--- a/paddle/operators/adam_op.h
+++ b/paddle/operators/adam_op.h
@@ -31,9 +31,9 @@ class AdamOpKernel : public framework::OpKernel<T> {
    moment1_out_tensor->mutable_data<T>(ctx.GetPlace());
    moment2_out_tensor->mutable_data<T>(ctx.GetPlace());

-    float beta1 = ctx.Attr<float>("beta1");
-    float beta2 = ctx.Attr<float>("beta2");
-    float epsilon = ctx.Attr<float>("epsilon");
+    T beta1 = static_cast<T>(ctx.Attr<float>("beta1"));
+    T beta2 = static_cast<T>(ctx.Attr<float>("beta2"));
+    T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));

    auto param = framework::EigenVector<T>::Flatten(
        *ctx.Input<framework::Tensor>("Param"));

--- a/paddle/operators/adamax_op.cc
+++ b/paddle/operators/adamax_op.cc
@@ -126,4 +126,5 @@ division by 0 error.
 namespace ops = paddle::operators;
 REGISTER_OP_WITHOUT_GRADIENT(adamax, ops::AdamaxOp, ops::AdamaxOpMaker);
 REGISTER_OP_CPU_KERNEL(adamax,
-                       ops::AdamaxOpKernel<paddle::platform::CPUPlace, float>);
+                       ops::AdamaxOpKernel<paddle::platform::CPUPlace, float>,
+                       ops::AdamaxOpKernel<paddle::platform::CPUPlace, double>);
--- a/paddle/operators/adamax_op.cu
+++ b/paddle/operators/adamax_op.cu
@@ -17,4 +17,5 @@

 namespace ops = paddle::operators;
 REGISTER_OP_GPU_KERNEL(adamax,
-                       ops::AdamaxOpKernel<paddle::platform::GPUPlace, float>);
+                       ops::AdamaxOpKernel<paddle::platform::GPUPlace, float>,
+                       ops::AdamaxOpKernel<paddle::platform::GPUPlace, double>);
--- a/paddle/operators/adamax_op.h
+++ b/paddle/operators/adamax_op.h
@@ -31,9 +31,9 @@ class AdamaxOpKernel : public framework::OpKernel<T> {
    moment_out_tensor->mutable_data<T>(ctx.GetPlace());
    inf_norm_out_tensor->mutable_data<T>(ctx.GetPlace());

-    float beta1 = ctx.Attr<float>("beta1");
-    float beta2 = ctx.Attr<float>("beta2");
-    float epsilon = ctx.Attr<float>("epsilon");
+    T beta1 = static_cast<T>(ctx.Attr<float>("beta1"));
+    T beta2 = static_cast<T>(ctx.Attr<float>("beta2"));
+    T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));

    auto param = framework::EigenVector<T>::Flatten(
        *ctx.Input<framework::Tensor>("Param"));

--- a/paddle/operators/beam_search_op.cc
+++ b/paddle/operators/beam_search_op.cc
@@ -139,7 +139,7 @@ bool BeamSearch::NextItemSet(std::vector<BeamSearch::Item> *items) {
  items->reserve(framework::product(ids.dims()));
  for (size_t offset = abs_lod[lod_level_][sent_offset_];
       offset < abs_lod[lod_level_][sent_offset_ + 1]; offset++) {
-    for (int d = 0; d < instance_dim; d++) {
+    for (size_t d = 0; d < instance_dim; d++) {
      const size_t dim_offset = offset * instance_dim + d;
      items->emplace_back(offset, ids_data[dim_offset],
                          scores_data[dim_offset]);

--- a/paddle/operators/gru_unit_op.cc
+++ b/paddle/operators/gru_unit_op.cc
@@ -114,18 +114,19 @@ class GRUUnitOpMaker : public framework::OpProtoAndCheckerMaker {
        .SetDefault(sigmoid)
        .InEnum({identity, sigmoid, tanh, relu});
    AddComment(R"DOC(
-GRUUnit Operator.
-
-This operator implements partial calculations of the GRU unit as follows:
+GRUUnit Operator implements partial calculations of the GRU unit as follows:

 $$
-update \ gate: u_t = actGate(xu_t + W_u * hidden_{prev} + bias_u) \\
-reset \ gate: r_t = actGate(xr_t + W_r * hidden_{prev} + bias_r)  \\
-output \ candidate: {h}_t = actNode({xc}_t + W_c * dot(r_t, hidden_{prev}) + bias_c) \\
-output: h_t = dot((1-u_t), {h}_t) + dot(u_t, hidden_{prev})
+update \ gate: u_t = actGate(xu_t + W_u * h_{t-1} + b_u) \\
+reset \ gate: r_t = actGate(xr_t + W_r * h_{t-1} + b_r)  \\
+output \ candidate: {h}_t = actNode(xc_t + W_c * dot(r_t, h_{t-1}) + b_c) \\
+output: h_t = dot((1 - u_t), h_{t-1}) + dot(u_t, {h}_t)
 $$

-The rest of GRU unit can be completed by using FCOp's output as the input of GRUUnitOp.
+which is the same as one time step of the GRU Operator.
+
+@note To implement the complete GRU unit, a fully-connected operator must be
+used beforehand to feed xu, xr and xc as the Input of the GRUUnit operator.

 )DOC");
  }
@@ -150,12 +151,6 @@ class GRUUnitGradOp : public framework::OperatorWithKernel {
                   "ResetHiddenPrev");
    PADDLE_ENFORCE(ctx->HasInput("Hidden"),
                   "Input(%s) of GRUUnitGradOp should not be null.", "Hidden");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Gate")),
-                   "Input(%s@GRAD) of GRUUnitGradOp should not be null.",
-                   "Gate");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("ResetHiddenPrev")),
-                   "Input(%s@GRAD) of GRUUnitGradOp should not be null.",
-                   "ResetHiddenPrev");
    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Hidden")),
                   "Input(%s@GRAD) of GRUUnitGradOp should not be null.",
                   "Hidden");

--- a/paddle/operators/gru_unit_op.h
+++ b/paddle/operators/gru_unit_op.h
@@ -110,7 +110,7 @@ class GRUUnitKernel : public framework::OpKernel<T> {
    auto c = g.slice(c_offsets, extents);  // output candidate

    // calculate final output
-    h.device(place) = u * (h_p - c) + c;
+    h.device(place) = u * (c - h_p) + h_p;
  }
 };

@@ -146,35 +146,27 @@ class GRUUnitGradKernel : public framework::OpKernel<T> {
    auto* weight_grad =
        context.Output<Tensor>(framework::GradVarName("Weight"));
    auto* bias_grad = context.Output<Tensor>(framework::GradVarName("Bias"));
-    input_grad->mutable_data<T>(context.GetPlace());
-    hidden_prev_grad->mutable_data<T>(context.GetPlace());
-    weight_grad->mutable_data<T>(context.GetPlace());
    Tensor gate_grad;
-    gate_grad.mutable_data<T>(input->dims(), context.GetPlace());
    Tensor reset_hidden_prev_grad;
-    reset_hidden_prev_grad.mutable_data<T>(reset_hidden_prev->dims(),
-                                           context.GetPlace());
-
-    int batch_size = input->dims()[0];
-    int frame_size = hidden_prev->dims()[1];

    const T* hidden_prev_data = hidden_prev->data<T>();
-    T* hidden_prev_grad_data = hidden_prev_grad->data<T>();
    const T* weight_data = weight->data<T>();
-    T* weight_grad_data = weight_grad->data<T>();
-    T* gate_grad_data = gate_grad.data<T>();
+    T* gate_grad_data =
+        gate_grad.mutable_data<T>(input->dims(), context.GetPlace());
    const T* reset_hidden_prev_data = reset_hidden_prev->data<T>();
-    T* reset_hidden_prev_grad_data = reset_hidden_prev_grad.data<T>();
+    T* reset_hidden_prev_grad_data = reset_hidden_prev_grad.mutable_data<T>(
+        reset_hidden_prev->dims(), context.GetPlace());

    auto h_p = EigenMatrix<T>::From(*hidden_prev);
    auto g = EigenMatrix<T>::From(*gate);
    auto d_h = EigenMatrix<T>::From(*hidden_grad);
-    auto d_x = EigenMatrix<T>::From(*input_grad);
-    auto d_h_p = EigenMatrix<T>::From(*hidden_prev_grad);
    auto d_g = EigenMatrix<T>::From(gate_grad);
    auto d_r_h_p = EigenMatrix<T>::From(reset_hidden_prev_grad);
    auto place = context.GetEigenDevice<Place>();

+    int batch_size = input->dims()[0];
+    int frame_size = hidden_prev->dims()[1];
+
    Eigen::array<int, 2> extents({{batch_size, frame_size}});
    Eigen::array<int, 2> u_offsets({{0, 0}});
    auto u = g.slice(u_offsets, extents);  // update gate
@@ -185,38 +177,52 @@ class GRUUnitGradKernel : public framework::OpKernel<T> {

    // backward for unactivated update gate
    ActGradCompute(context.Attr<int>("gate_activation"), place, u, u,
-                   d_g.slice(u_offsets, extents), d_h * (h_p - c));
+                   d_g.slice(u_offsets, extents), d_h * (c - h_p));
    // backward for unactivated output candidate
    ActGradCompute(context.Attr<int>("activation"), place, c, c,
-                   d_g.slice(c_offsets, extents), d_h * (u.constant(T(1)) - u));
+                   d_g.slice(c_offsets, extents), d_h * u);
    // backward for reset_hidden_prev
    math::gemm<Place, T>(context.device_context(), false, true, batch_size,
                         frame_size, frame_size, 1,
                         gate_grad_data + frame_size * 2, frame_size * 3,
                         weight_data + frame_size * frame_size * 2, frame_size,
                         0, reset_hidden_prev_grad_data, frame_size);
-    // backward for state_weight
-    math::gemm<Place, T>(
-        context.device_context(), true, false, frame_size, frame_size,
-        batch_size, 1, reset_hidden_prev_data, frame_size,
-        gate_grad_data + frame_size * 2, frame_size * 3, 0,
-        weight_grad_data + frame_size * frame_size * 2, frame_size);
    // backward for unactivated reset gate
    ActGradCompute(context.Attr<int>("gate_activation"), place, r, r,
                   d_g.slice(r_offsets, extents), d_r_h_p * h_p);
-    // backward for update_gate_weight and reset_gate_weight
-    math::gemm<Place, T>(context.device_context(), true, false, frame_size,
-                         frame_size * 2, batch_size, 1, hidden_prev_data,
-                         frame_size, gate_grad_data, frame_size * 3, 0,
-                         weight_grad_data, frame_size * 2);
+    // backward for weight
+    if (weight_grad) {
+      T* weight_grad_data = weight_grad->mutable_data<T>(context.GetPlace());
+      // backward for state_weight
+      math::gemm<Place, T>(
+          context.device_context(), true, false, frame_size, frame_size,
+          batch_size, 1, reset_hidden_prev_data, frame_size,
+          gate_grad_data + frame_size * 2, frame_size * 3, 0,
+          weight_grad_data + frame_size * frame_size * 2, frame_size);
+
+      // backward for update_gate_weight and reset_gate_weight
+      math::gemm<Place, T>(context.device_context(), true, false, frame_size,
+                           frame_size * 2, batch_size, 1, hidden_prev_data,
+                           frame_size, gate_grad_data, frame_size * 3, 0,
+                           weight_grad_data, frame_size * 2);
+    }
    // backward for hidden_prev
-    d_h_p.device(place) = d_r_h_p * r + d_h * u;
-    math::gemm<Place, T>(context.device_context(), false, true, batch_size,
-                         frame_size, frame_size * 2, 1, gate_grad_data,
-                         frame_size * 3, weight_data, frame_size * 2, 1,
-                         hidden_prev_grad_data, frame_size);
+    if (hidden_prev_grad) {
+      T* hidden_prev_grad_data =
+          hidden_prev_grad->mutable_data<T>(context.GetPlace());
+      auto d_h_p = EigenMatrix<T>::From(*hidden_prev_grad);
+      d_h_p.device(place) = d_r_h_p * r + d_h * (u.constant(T(1)) - u);
+      math::gemm<Place, T>(context.device_context(), false, true, batch_size,
+                           frame_size, frame_size * 2, 1, gate_grad_data,
+                           frame_size * 3, weight_data, frame_size * 2, 1,
+                           hidden_prev_grad_data, frame_size);
+    }
    // backward for input
-    d_x.device(place) = d_g;
+    if (input_grad) {
+      input_grad->mutable_data<T>(context.GetPlace());
+      auto d_x = EigenMatrix<T>::From(*input_grad);
+      d_x.device(place) = d_g;
+    }
    // backward for bias
    if (bias_grad) {
      bias_grad->mutable_data<T>(context.GetPlace());

--- a/paddle/operators/linear_chain_crf_op.h
+++ b/paddle/operators/linear_chain_crf_op.h
@@ -271,7 +271,7 @@ class LinearChainCRFOpKernel : public framework::OpKernel<T> {
    ll -= std::log(sum);
    // Now ll is equal to -log(Z).

-    const int* lbl = label.data<int>();
+    const int64_t* lbl = label.data<int64_t>();
    PADDLE_ENFORCE_LT(
        static_cast<size_t>(*std::max_element(lbl, lbl + seq_length)), tag_num,
        "An invalid tag label that execesses the largest tag number.");
@@ -449,7 +449,7 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel<T> {
                           Tensor* emission_grad) const {
    const T* w_exps = transition_exps.data<T>();
    const T* x_exps = emission_exps.data<T>();
-    const int* label_value = label.data<int>();
+    const int64_t* label_value = label.data<int64_t>();
    T* beta_value = beta->data<T>();

    auto x_dims = emission_exps.dims();

--- a/paddle/operators/math/CMakeLists.txt
+++ b/paddle/operators/math/CMakeLists.txt
@@ -14,6 +14,7 @@ if(WITH_GPU)
    nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context)
    nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions)
    nv_library(gru_compute SRCS gru_compute.cc gru_compute.cu DEPS device_context activation_functions math_function)
+    nv_library(maxouting SRCS maxouting.cc maxouting.cu DEPS device_context)
 else()
    cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context framework_proto)
    cc_library(selected_rows_functor SRCS selected_rows_functor.cc DEPS selected_rows math_function)
@@ -26,6 +27,7 @@ else()
    cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context)
    cc_library(lstm_compute SRCS lstm_compute.cc DEPS device_context activation_functions)
    cc_library(gru_compute SRCS gru_compute.cc DEPS device_context activation_functions math_function)
+    cc_library(maxouting SRCS maxouting.cc DEPS device_context)
 endif()

 cc_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor)

--- a/paddle/operators/math/maxouting.cc
+++ b/paddle/operators/math/maxouting.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/maxouting.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+// CPU forward pass of maxout. All tensors are in NCHW format, and `groups`
+// must be greater than 1.
+//
+// Each output channel c holds, per spatial position, the maximum over the
+// `groups` consecutive input channels [c * groups, (c + 1) * groups).
+// The indexing assumes input channels == output channels * groups.
+template <typename T>
+class MaxOutFunctor<platform::CPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input,
+                  framework::Tensor * output,
+                  int groups) {
+    const int batch_size = input.dims()[0];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_channels = output->dims()[1];
+    const int fea_size = input_height * input_width;
+    // c_size is the number of output elements of one sample.
+    const int c_size = fea_size * output_channels;
+    const T* input_data = input.data<T>();
+    T* output_data = output->mutable_data<T>(context.GetPlace());
+
+    for (int i = 0; i < batch_size; ++i) {
+      const int new_bindex = c_size * i;
+      for (int c = 0; c < output_channels; ++c) {
+        const int new_cindex = fea_size * c;
+        // Offset of the first input channel belonging to this output channel.
+        const int in_base = (new_bindex + new_cindex) * groups;
+        for (int f = 0; f < fea_size; ++f) {
+          // Seed the running maximum with the first candidate instead of
+          // -FLT_MAX: a float sentinel is not the lowest value of T when T is
+          // double, and it would also replace an all -inf group by -FLT_MAX.
+          T ele = input_data[in_base + f];
+          for (int ph = 1; ph < groups; ++ph) {
+            const T x = input_data[in_base + ph * fea_size + f];
+            ele = ele > x ? ele : x;
+          }
+          output_data[new_bindex + new_cindex + f] = ele;
+        }
+      }
+    }
+  }
+};
+
+
+
+// CPU backward pass of maxout; tensors are indexed in NCHW order.
+//
+// For every output element, the incoming gradient is added to the first
+// input element of its group whose value equals the forward output. The
+// gradient is accumulated with `+=`, so the caller is responsible for the
+// initial contents of `input_grad` (MaxOutGradKernel zero-fills it).
+template <class T>
+class MaxOutGradFunctor<platform::CPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input,
+                  framework::Tensor * input_grad,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad,
+                  int groups) {
+    const int num_samples = input.dims()[0];
+    const int height = input.dims()[2];
+    const int width = input.dims()[3];
+    const int out_channels = output.dims()[1];
+    const int plane = height * width;
+    const T* x = input.data<T>();
+    const T* y = output.data<T>();
+    const T* dy = output_grad.data<T>();
+    T* dx = input_grad->mutable_data<T>(context.GetPlace());
+
+    for (int n = 0; n < num_samples; ++n) {
+      const int sample_off = plane * out_channels * n;
+      for (int c = 0; c < out_channels; ++c) {
+        const int out_base = sample_off + plane * c;
+        for (int f = 0; f < plane; ++f) {
+          const int out_idx = out_base + f;
+          // Position f inside the first input channel of this group.
+          const int first_in = out_base * groups + f;
+          for (int g = 0; g < groups; ++g) {
+            const int in_idx = first_in + plane * g;
+            if (x[in_idx] == y[out_idx]) {
+              dx[in_idx] += dy[out_idx];
+              break;  // only the first matching element gets the gradient
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
+template class MaxOutGradFunctor<platform::CPUPlace, float>;
+template class MaxOutGradFunctor<platform::CPUPlace, double>;
+template class MaxOutFunctor<platform::CPUPlace, float>;
+template class MaxOutFunctor<platform::CPUPlace, double>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/math/maxouting.cu
+++ b/paddle/operators/math/maxouting.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/maxouting.h"
+#include "paddle/platform/cuda_helper.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+// CUDA forward kernel of maxout (tensors indexed in NCHW order).
+//
+// nthreads    : number of output elements to produce.
+// input_data  : input tensor data.
+// channels    : number of input channels.
+// groups      : number of consecutive input channels reduced into one
+//               output channel.
+// output_data : output tensor data, written at indices [0, nthreads).
+//
+// Each thread starts at its global index and advances by the total number
+// of launched threads, so any grid size covers all nthreads elements.
+template <typename T>
+__global__ void KernelMaxOut(const int nthreads, const T* input_data,
+                             const int channels,
+                             const int input_height, const int input_width,
+                             int groups, T* output_data ) {
+  // Number of output elements per sample.
+  const int size = input_height * input_width * channels / groups;
+  const int feat_len = input_height * input_width;
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int offset = blockDim.x * gridDim.x;
+  for (int i = index; i < nthreads; i += offset) {
+    int batch_idx = i / size;
+    int batch_offset = i % size;
+    int channel_idx = batch_offset / feat_len;
+    int feat_idx = batch_offset % feat_len;
+    // Index of this spatial position in the first input channel of the group.
+    int data_idx =
+      (batch_idx * size + channel_idx * feat_len) * groups + feat_idx;
+    // Seed with the first candidate rather than -FLT_MAX, which is not the
+    // lowest value of T for double and would mask an all -inf group.
+    T ele = input_data[data_idx];
+    for (int g = 1; g < groups; ++g) {
+      T x = input_data[data_idx + g * feat_len];
+      ele = ele > x ? ele : x;
+    }
+    output_data[i] = ele;
+  }
+}
+// CUDA backward kernel of maxout (tensors indexed in NCHW order).
+//
+// For each output element i, finds the first input element of its group
+// whose value equals output_data[i] and adds output_grad[i] to the matching
+// entry of input_grad. Groups of different output elements do not overlap,
+// so the non-atomic `+=` never targets the same address from two elements.
+// input_grad is accumulated into, not overwritten.
+template <typename T>
+__global__ void KernelMaxoutGrad(
+    const int nthreads, const T* input_data, const T* output_data,
+    const T* output_grad, T* input_grad, const int channels,
+    const int input_height, const int input_width, int groups) {
+    // Number of output elements per sample.
+    const int size = input_height * input_width * channels / groups;
+    const int feat_len = input_height * input_width;
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    int offset = blockDim.x * gridDim.x;
+    for (int i = index; i < nthreads; i += offset) {
+      int batch_idx = i / size;
+      int batch_offset = i % size;
+      int channel_idx = batch_offset / feat_len;
+      int feat_idx = batch_offset % feat_len;
+      int data_idx =
+        (batch_idx * size + channel_idx * feat_len) * groups + feat_idx;
+      int max_index = -1;
+      for (int g = 0; g < groups; ++g) {
+        if (input_data[data_idx + g * feat_len] == output_data[i]) {
+          max_index = data_idx + g * feat_len;
+          break;
+        }
+      }
+      if (max_index != -1) {
+        // Use the loop variable i, not the thread's starting index: they
+        // differ as soon as a thread handles more than one element.
+        input_grad[max_index] += output_grad[i];
+      }
+    }
+}
+/*
+ * GPU forward pass of maxout. All tensors are in NCHW format.
+ *
+ * Launches KernelMaxOut with one thread per output element (1024 threads
+ * per block) on the stream of `context`. `context` is cast to
+ * CUDADeviceContext without a runtime check.
+ */
+template <typename T>
+class MaxOutFunctor<platform::GPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor * output,
+                  int groups) {
+    const int input_channels = input.dims()[1];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+
+    const T* input_data = input.data<T>();
+    T* output_data = output->mutable_data<T>(context.GetPlace());
+    int nthreads =  output->numel();
+    // Round up so that every output element gets a thread.
+    int blocks = (nthreads + 1024 - 1) / 1024;
+    dim3 threads(1024, 1);
+    dim3 grid(blocks, 1);
+
+    KernelMaxOut<
+        T><<<grid, threads, 0,
+             reinterpret_cast<const platform::CUDADeviceContext&>(context)
+                 .stream()>>>(nthreads, input_data, input_channels,
+                              input_height, input_width, groups,
+                              output_data);
+  }
+};
+/*
+ * GPU backward pass of maxout. All tensors are in NCHW format.
+ *
+ * Launches KernelMaxoutGrad with one thread per output element (1024
+ * threads per block) on the stream of `context`. The kernel accumulates
+ * into `input_grad`, so its contents before the call matter. `context` is
+ * cast to CUDADeviceContext without a runtime check.
+ */
+template <typename T>
+class MaxOutGradFunctor<platform::GPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input,
+                  framework::Tensor * input_grad,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad,
+                  int groups) {
+    const int input_channels = input.dims()[1];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+
+    const T* input_data = input.data<T>();
+    const T* output_data = output.data<T>();
+    const T* output_grad_data = output_grad.data<T>();
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
+    int nthreads =  output.numel();
+    // Round up so that every output element gets a thread.
+    int blocks = (nthreads + 1024 - 1) / 1024;
+    dim3 threads(1024, 1);
+    dim3 grid(blocks, 1);
+
+    KernelMaxoutGrad<
+        T><<<grid, threads, 0,
+             reinterpret_cast<const platform::CUDADeviceContext&>(context)
+                 .stream()>>>(
+        nthreads, input_data, output_data, output_grad_data, input_grad_data,
+        input_channels, input_height, input_width, groups);
+  }
+};
+
+template class MaxOutGradFunctor<platform::GPUPlace, float>;
+template class MaxOutGradFunctor<platform::GPUPlace, double>;
+
+template class MaxOutFunctor<platform::GPUPlace, float>;
+template class MaxOutFunctor<platform::GPUPlace, double>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/math/maxouting.h
+++ b/paddle/operators/math/maxouting.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/tensor.h"
+#include "paddle/platform/device_context.h"
+#include "paddle/platform/hostdevice.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+#define FLT_MAX \
+    __FLT_MAX__
+
+// Forward maxout functor, specialized per Place in maxouting.cc (CPU) and
+// maxouting.cu (GPU). Reads `input` and writes the result into `output`;
+// `groups` is the number of input channels reduced into one output channel.
+template <typename Place, typename T>
+class MaxOutFunctor {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input,
+                  framework::Tensor* output,
+                  int groups);
+};
+
+// Backward maxout functor, specialized per Place in maxouting.cc (CPU) and
+// maxouting.cu (GPU). Given the forward `input` and `output` together with
+// `output_grad`, it updates `input_grad`.
+template <typename Place, typename T>
+class MaxOutGradFunctor {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input,
+                  framework::Tensor* input_grad,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad,
+                  int groups);
+};
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/maxout_op.cc
+++ b/paddle/operators/maxout_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *     Unless required by applicable law or agreed to in writing, software
+ *     distributed under the License is distributed on an "AS IS" BASIS,
+ *     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *     See the License for the specific language governing permissions and
+ *     limitations under the License. */
+
+#include "paddle/operators/maxout_op.h"
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+// Declares the input, output, attribute and user documentation of the
+// maxout operator.
+class MaxOutOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  MaxOutOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+        "(Tensor) The input tensor of maxout operator. "
+        "The format of input tensor is NCHW. Where N is batch size, C is the "
+        "number of channels, H and W is the height and width of feature.");
+    // Adjacent literals are concatenated verbatim, so every piece but the
+    // last must end with a space.
+    AddOutput("Out",
+        "(Tensor) The output tensor of maxout operator. "
+        "The format of output tensor is also NCHW. "
+        "Where N is batch size, C is "
+        "the number of channels, H and W is the height and "
+        "width of feature.");
+    // Plain literals instead of a raw string: inside R"DOC(...)DOC" the
+    // quote characters and line breaks would become part of the text.
+    AddAttr<int>(
+        "groups",
+        "Specifies how many groups the input tensor will be split "
+        "in the channel dimension. And the number of output channel is "
+        "the number of channels divided by groups.");
+    AddComment(R"DOC(
+        Assume the input shape is (N, Ci, H, W).
+        The output shape is (N, Co, H, W). Then `Co = Ci / groups`.
+
+       math:
+       y_{si+j} = \max_k x_{gsi + sk + j}
+       g = groups
+       s = input.size / num_channels
+       0 \le i < num_channels / groups
+       0 \le j < s
+       0 \le k < groups
+
+    Please refer to Paper:
+      - Maxout Networks: http://www.jmlr.org/proceedings/papers/v28/goodfellow13.pdf
+      - Multi-digit Number Recognition from Street View
+        Imagery using Deep Convolutional Neural Networks:
+        https://arxiv.org/pdf/1312.6082v4.pdf
+        )DOC");
+  }
+};
+
+
+// Forward maxout operator. InferShape validates the inputs and sets the
+// shape of Out to (N, C / groups, H, W) for an input X of shape (N, C, H, W).
+class MaxOutOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of MaxoutOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of MaxoutOp should not be null.");
+    auto in_x_dims = ctx->GetInputDim("X");
+    int groups = ctx->Attrs().Get<int>("groups");
+    // Dimensions 0..3 are indexed below, so X must be 4-D.
+    PADDLE_ENFORCE(in_x_dims.size() == 4,
+                   "Input(X) of MaxoutOp should be a 4-D tensor in NCHW "
+                   "format.");
+    // check groups > 1
+    PADDLE_ENFORCE_GT(
+        groups, 1,
+        "groups should be larger than 1 in maxoutop");
+    // The compute functors index the input as output_channels * groups
+    // channels; a remainder would make them address the wrong elements.
+    PADDLE_ENFORCE(in_x_dims[1] % groups == 0,
+                   "The number of channels of Input(X) should be divisible "
+                   "by groups in maxoutop");
+    std::vector<int64_t> output_shape({in_x_dims[0], in_x_dims[1] / groups});
+    output_shape.push_back(in_x_dims[2]);
+    output_shape.push_back(in_x_dims[3]);
+    ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
+  }
+};
+
+// Gradient operator of maxout. The gradient of X has the same shape as X.
+class MaxOutOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
+    // X@GRAD is an output of this operator, so report it as such.
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
+                   "Output(X@GRAD) should not be null.");
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+  }
+};
+}    // namespace operators
+}    // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(maxout, ops::MaxOutOp, ops::MaxOutOpMaker, maxout_grad,
+                        ops::MaxOutOpGrad);
+// Register both float and double so the CPU kernels cover the same element
+// types as the GPU kernels in maxout_op.cu.cc; the CPU functors in
+// math/maxouting.cc are already instantiated for both.
+REGISTER_OP_CPU_KERNEL(maxout,
+                       ops::MaxOutKernel<paddle::platform::CPUPlace, float>,
+                       ops::MaxOutKernel<paddle::platform::CPUPlace, double>);
+REGISTER_OP_CPU_KERNEL(maxout_grad,
+                       ops::MaxOutGradKernel<paddle::platform::CPUPlace,
+                       float>,
+                       ops::MaxOutGradKernel<paddle::platform::CPUPlace,
+                       double>);
--- a/paddle/operators/maxout_op.cu.cc
+++ b/paddle/operators/maxout_op.cu.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/maxout_op.h"
+
+namespace ops = paddle::operators;
+// GPU kernels of maxout and its gradient, for float and double.
+REGISTER_OP_GPU_KERNEL(
+    maxout, ops::MaxOutKernel<paddle::platform::GPUPlace, float>,
+    ops::MaxOutKernel<paddle::platform::GPUPlace, double>);
+REGISTER_OP_GPU_KERNEL(
+    maxout_grad, ops::MaxOutGradKernel<paddle::platform::GPUPlace, float>,
+    ops::MaxOutGradKernel<paddle::platform::GPUPlace, double>);
--- a/paddle/operators/maxout_op.h
+++ b/paddle/operators/maxout_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/maxouting.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+// Forward kernel of maxout: reads input "X" and attribute "groups", and
+// delegates the computation of "Out" to math::MaxOutFunctor.
+template <typename Place, typename T>
+class MaxOutKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor* x = context.Input<Tensor>("X");
+    Tensor* y = context.Output<Tensor>("Out");
+    const int groups = context.template Attr<int>("groups");
+
+    math::MaxOutFunctor<Place, T> forward;
+    forward(context.device_context(), *x, y, groups);
+  }
+};
+
+// Backward kernel of maxout: when the X@GRAD output is present, allocates
+// it, fills it with zeros and lets math::MaxOutGradFunctor accumulate the
+// gradient into it. Does nothing when X@GRAD is absent.
+template <typename Place, typename T>
+class MaxOutGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor* x = context.Input<Tensor>("X");
+    const Tensor* y = context.Input<Tensor>("Out");
+    const Tensor* dy = context.Input<Tensor>(framework::GradVarName("Out"));
+    Tensor* dx = context.Output<Tensor>(framework::GradVarName("X"));
+    const int groups = context.template Attr<int>("groups");
+    auto& dev_ctx = context.device_context();
+    math::SetConstant<Place, T> fill;
+    if (!dx) {
+      return;
+    }
+
+    dx->mutable_data<T>(context.GetPlace());
+    // The functor accumulates with +=, so start from an all-zero gradient.
+    fill(dev_ctx, dx, static_cast<T>(0.0));
+    math::MaxOutGradFunctor<Place, T> backward;
+    backward(context.device_context(), *x, dx, *y, *dy, groups);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/sequence_conv_op.cc
+++ b/paddle/operators/sequence_conv_op.cc
@@ -179,7 +179,9 @@ REGISTER_OP(sequence_conv, ops::SequenceConvOp, ops::SequenceConvOpMaker,
            sequence_conv_grad, ops::SequenceConvGradOp);

 REGISTER_OP_CPU_KERNEL(
-    sequence_conv, ops::SequenceConvKernel<paddle::platform::CPUPlace, float>);
+    sequence_conv, ops::SequenceConvKernel<paddle::platform::CPUPlace, float>,
+    ops::SequenceConvKernel<paddle::platform::CPUPlace, double>);
 REGISTER_OP_CPU_KERNEL(
    sequence_conv_grad,
-    ops::SequenceConvGradKernel<paddle::platform::CPUPlace, float>);
+    ops::SequenceConvGradKernel<paddle::platform::CPUPlace, float>,
+    ops::SequenceConvGradKernel<paddle::platform::CPUPlace, double>);
--- a/paddle/operators/sequence_conv_op.cu.cc
+++ b/paddle/operators/sequence_conv_op.cu.cc
@@ -16,7 +16,9 @@

 namespace ops = paddle::operators;
 REGISTER_OP_GPU_KERNEL(
-    sequence_conv, ops::SequenceConvKernel<paddle::platform::GPUPlace, float>);
+    sequence_conv, ops::SequenceConvKernel<paddle::platform::GPUPlace, float>,
+    ops::SequenceConvKernel<paddle::platform::GPUPlace, double>);
 REGISTER_OP_GPU_KERNEL(
    sequence_conv_grad,
-    ops::SequenceConvGradKernel<paddle::platform::GPUPlace, float>);
+    ops::SequenceConvGradKernel<paddle::platform::GPUPlace, float>,
+    ops::SequenceConvGradKernel<paddle::platform::GPUPlace, double>);
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -144,7 +144,7 @@ function gen_dockerfile() {
    DOCKERFILE_GPU_ENV=""
    DOCKERFILE_CUDNN_DSO=""
    if [[ ${WITH_GPU:-OFF} == 'ON' ]]; then
-        DOCKERFILE_GPU_ENV="ENV LD_LIBRARY_PATH /usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}"
+        DOCKERFILE_GPU_ENV="ENV LD_LIBRARY_PATH /usr/lib/x86_64-linux-gnu:\${LD_LIBRARY_PATH}"
        DOCKERFILE_CUDNN_DSO="RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.5 /usr/lib/x86_64-linux-gnu/libcudnn.so"
    fi


--- a/paddle/trainer/Trainer.cpp
+++ b/paddle/trainer/Trainer.cpp
@@ -138,7 +138,7 @@ void Trainer::init(const std::shared_ptr<TrainerConfigHelper>& config,
  }

  if (FLAGS_use_mkldnn) {
-    CHECK_EQ(FLAGS_trainer_count, 1UL) << "MKLDNN only need 1 trainer";
+    CHECK_EQ(FLAGS_trainer_count, 1) << "MKLDNN only need 1 trainer";
  }

  if (testing) {

--- a/paddle/trainer/tests/CMakeLists.txt
+++ b/paddle/trainer/tests/CMakeLists.txt
@@ -11,7 +11,6 @@ add_unittest_without_exec(test_Trainer
    test_Trainer.cpp)
 add_test(NAME test_Trainer
  COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
-        ${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/paddle/trainer/tests/gen_proto_data.py &&
        ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
        ${CMAKE_CURRENT_BINARY_DIR}/test_Trainer
    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)

--- a/paddle/trainer/tests/chunking.conf
+++ b/paddle/trainer/tests/chunking.conf
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-#Todo(luotao02) This config is only used for unitest. It is out of date now, and will be updated later.
-
-TrainData(ProtoData(
-  files = 'trainer/tests/train_files.txt',
-  usage_ratio = 1.0,
-))
-
-TestData(ProtoData(
-  files = 'trainer/tests/test_files.txt'
-))
-
-default_initial_std(1)
-default_decay_rate(4e-4)
-default_device(0)
-
-Inputs("features", "word", "pos", "chunk")
-
-Outputs("crf")
-
-Layer(
-    name = "features",
-    type = "data",
-    size = 4339,
-)
-
-Layer(
-    name = "word",
-    type = "data",
-    size = 478,
-)
-
-Layer(
-    name = "pos",
-    type = "data",
-    size = 45
-)
-
-Layer(
-    name = "chunk",
-    type = "data",
-    size = 23
-)
-
-Layer(
-    name = "output",
-    type = "mixed",
-    size = 23,
-    bias = False,
-    device = -1,
-    inputs = [
-        FullMatrixProjection("features", parameter_name="feature_weights"),
-    #    TableProjection("word"),
-    #    TableProjection("pos"),
-    ],
-)
-
-Layer(
-    name = "crf",
-    type = "crf",
-    size = 23,
-    device = -1,
-    inputs = [
-        Input("output", parameter_name="crfw"),
-        "chunk"
-    ]
-)
-
-Layer(
-    name = "crf_decoding",
-    type = "crf_decoding",
-    size = 23,
-    device = -1,
-    inputs = [
-        Input("output", parameter_name="crfw"),
-        "chunk"
-    ]
-)
-
-Evaluator(
-    name = "error",
-    type = "sum",
-    inputs = "crf_decoding",
-)
-
-'''
-# chuck evaluator cannot be used for GPU training
-Evaluator(
-    name = "chunk_f1",
-    type = "chunk",
-    inputs = ["crf_decoding", "chunk"],
-    chunk_scheme = "IOB",
-    num_chunk_types = 11,
-)
-'''
-
-Settings(
-    algorithm = 'sgd',
-    batch_size = 100,
-    average_window = 0.5,
-    max_average_window = 2500,
-    learning_rate = 1e-1,
-    learning_rate_decay_a = 5e-7,
-    learning_rate_decay_b = 0.75,
-    l1weight = 0,
-    l2weight = 1,
-    c1 = 0.0001,
-    backoff = 0.5,
-    owlqn_steps = 100,
-    max_backoff = 5,
-)
--- a/paddle/trainer/tests/compare_sparse_data
+++ b/paddle/trainer/tests/compare_sparse_data
--- a/paddle/trainer/tests/data_bin_part
+++ b/paddle/trainer/tests/data_bin_part
--- a/paddle/trainer/tests/gen_proto_data.py
+++ b/paddle/trainer/tests/gen_proto_data.py
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from cStringIO import StringIO
-
-import paddle.proto.DataFormat_pb2 as DataFormat
-from google.protobuf.internal.encoder import _EncodeVarint
-
-import logging
-import pprint
-
-logging.basicConfig(
-    format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s', )
-logger = logging.getLogger('paddle')
-logger.setLevel(logging.INFO)
-
-OOV_POLICY_IGNORE = 0
-OOV_POLICY_USE = 1
-OOV_POLICY_ERROR = 2
-
-num_original_columns = 3
-
-# Feature combination patterns.
-# [[-1,0], [0,0]]  means previous token at column 0 and current token at
-# column 0 are combined as one feature.
-patterns = [
-    [[-2, 0]],
-    [[-1, 0]],
-    [[0, 0]],
-    [[1, 0]],
-    [[2, 0]],
-    [[-1, 0], [0, 0]],
-    [[0, 0], [1, 0]],
-    [[-2, 1]],
-    [[-1, 1]],
-    [[0, 1]],
-    [[1, 1]],
-    [[2, 1]],
-    [[-2, 1], [-1, 1]],
-    [[-1, 1], [0, 1]],
-    [[0, 1], [1, 1]],
-    [[1, 1], [2, 1]],
-    [[-2, 1], [-1, 1], [0, 1]],
-    [[-1, 1], [0, 1], [1, 1]],
-    [[0, 1], [1, 1], [2, 1]],
-]
-
-
-def make_features(sequence):
-    length = len(sequence)
-    num_features = len(sequence[0])
-
-    def get_features(pos):
-        if pos < 0:
-            return ['#B%s' % -pos] * num_features
-        if pos >= length:
-            return ['#E%s' % (pos - length + 1)] * num_features
-        return sequence[pos]
-
-    for i in xrange(length):
-        for pattern in patterns:
-            fname = '/'.join([get_features(i + pos)[f] for pos, f in pattern])
-            sequence[i].append(fname)
-
-
-'''
-Source file format:
-Each line is for one timestep. The features are separated by space.
-An empty line indicates end of a sequence.
-
-cutoff: a list of numbers. If count of a feature is smaller than this,
- it will be ignored.
-if oov_policy[i] is OOV_POLICY_USE, id 0 is reserved for OOV features of
-i-th column.
-
-return a list of dict for each column
-'''
-
-
-def create_dictionaries(filename, cutoff, oov_policy):
-    def add_to_dict(sequence, dicts):
-        num_features = len(dicts)
-        for features in sequence:
-            l = len(features)
-            assert l == num_features, "Wrong number of features " + line
-            for i in xrange(l):
-                if features[i] in dicts[i]:
-                    dicts[i][features[i]] += 1
-                else:
-                    dicts[i][features[i]] = 1
-
-    num_features = len(cutoff)
-    dicts = []
-    for i in xrange(num_features):
-        dicts.append(dict())
-
-    f = open(filename, 'rb')
-
-    sequence = []
-
-    for line in f:
-        line = line.strip()
-        if not line:
-            make_features(sequence)
-            add_to_dict(sequence, dicts)
-            sequence = []
-            continue
-        features = line.split(' ')
-        sequence.append(features)
-
-    for i in xrange(num_features):
-        dct = dicts[i]
-        n = 1 if oov_policy[i] == OOV_POLICY_USE else 0
-        todo = []
-        for k, v in dct.iteritems():
-            if v < cutoff[i]:
-                todo.append(k)
-            else:
-                dct[k] = n
-                n += 1
-
-        if oov_policy[i] == OOV_POLICY_USE:
-            # placeholder so that len(dct) will be the number of features
-            # including OOV
-            dct['#OOV#'] = 0
-
-        logger.info('column %d dict size=%d, ignored %d' % (i, n, len(todo)))
-        for k in todo:
-            del dct[k]
-
-    f.close()
-    return dicts
-
-
-def encode_varint(v):
-    out = StringIO()
-    _EncodeVarint(out.write, v)
-    return out.getvalue()
-
-
-def write_proto(file, message):
-    s = message.SerializeToString()
-    packed_len = encode_varint(len(s))
-    file.write(packed_len + s)
-
-
-'''
-if oov_policy[i] == OOV_POLICY_USE, features in i-th column which are not
-existed in dicts[i] will be assigned to id 0.
-if oov_policy[i] == OOV_POLICY_ERROR, all features in i-th column MUST exist
-in dicts[i].
-'''
-
-
-def gen_proto_file(input_file, dicts, oov_policy, output_file):
-    def write_sequence(out, sequence):
-        num_features = len(dicts)
-        is_beginning = True
-        for features in sequence:
-            assert len(features) == num_features, \
-                "Wrong number of features: " + line
-            sample = DataFormat.DataSample()
-            for i in xrange(num_original_columns):
-                id = dicts[i].get(features[i], -1)
-                if id != -1:
-                    sample.id_slots.append(id)
-                elif oov_policy[i] == OOV_POLICY_IGNORE:
-                    sample.id_slots.append(0xffffffff)
-                elif oov_policy[i] == OOV_POLICY_ERROR:
-                    logger.fatal("Unknown token: %s" % features[i])
-                else:
-                    sample.id_slots.append(0)
-
-            if patterns:
-                dim = 0
-                vec = sample.vector_slots.add()
-                for i in xrange(num_original_columns, num_features):
-                    id = dicts[i].get(features[i], -1)
-                    if id != -1:
-                        vec.ids.append(dim + id)
-                    elif oov_policy[i] == OOV_POLICY_IGNORE:
-                        pass
-                    elif oov_policy[i] == OOV_POLICY_ERROR:
-                        logger.fatal("Unknown token: %s" % features[i])
-                    else:
-                        vec.ids.append(dim + 0)
-
-                    dim += len(dicts[i])
-
-            sample.is_beginning = is_beginning
-            is_beginning = False
-            write_proto(out, sample)
-
-    num_features = len(dicts)
-    f = open(input_file, 'rb')
-    out = open(output_file, 'wb')
-
-    header = DataFormat.DataHeader()
-    if patterns:
-        slot_def = header.slot_defs.add()
-        slot_def.type = DataFormat.SlotDef.VECTOR_SPARSE_NON_VALUE
-        slot_def.dim = sum(
-            [len(dicts[i]) for i in xrange(num_original_columns, len(dicts))])
-        logger.info("feature_dim=%s" % slot_def.dim)
-
-    for i in xrange(num_original_columns):
-        slot_def = header.slot_defs.add()
-        slot_def.type = DataFormat.SlotDef.INDEX
-        slot_def.dim = len(dicts[i])
-
-    write_proto(out, header)
-
-    num_sequences = 0
-    sequence = []
-    for line in f:
-        line = line.strip()
-        if not line:
-            make_features(sequence)
-            write_sequence(out, sequence)
-            sequence = []
-            num_sequences += 1
-            continue
-        features = line.split(' ')
-        sequence.append(features)
-
-    f.close()
-    out.close()
-
-    logger.info("num_sequences=%s" % num_sequences)
-
-
-dict2 = {
-    'B-ADJP': 0,
-    'I-ADJP': 1,
-    'B-ADVP': 2,
-    'I-ADVP': 3,
-    'B-CONJP': 4,
-    'I-CONJP': 5,
-    'B-INTJ': 6,
-    'I-INTJ': 7,
-    'B-LST': 8,
-    'I-LST': 9,
-    'B-NP': 10,
-    'I-NP': 11,
-    'B-PP': 12,
-    'I-PP': 13,
-    'B-PRT': 14,
-    'I-PRT': 15,
-    'B-SBAR': 16,
-    'I-SBAR': 17,
-    'B-UCP': 18,
-    'I-UCP': 19,
-    'B-VP': 20,
-    'I-VP': 21,
-    'O': 22
-}
-
-if __name__ == '__main__':
-    cutoff = [3, 1, 0]
-    cutoff += [3] * len(patterns)
-    oov_policy = [OOV_POLICY_IGNORE, OOV_POLICY_ERROR, OOV_POLICY_ERROR]
-    oov_policy += [OOV_POLICY_IGNORE] * len(patterns)
-    dicts = create_dictionaries('trainer/tests/train.txt', cutoff, oov_policy)
-    dicts[2] = dict2
-    gen_proto_file('trainer/tests/train.txt', dicts, oov_policy,
-                   'trainer/tests/train_proto.bin')
-    gen_proto_file('trainer/tests/test.txt', dicts, oov_policy,
-                   'trainer/tests/test_proto.bin')
--- a/paddle/trainer/tests/test.txt
+++ b/paddle/trainer/tests/test.txt
-Confidence NN B-NP
-in IN B-PP
-the DT B-NP
-pound NN I-NP
-is VBZ B-VP
-widely RB I-VP
-expected VBN I-VP
-to TO I-VP
-take VB I-VP
-another DT B-NP
-sharp JJ I-NP
-dive NN I-NP
-if IN B-SBAR
-trade NN B-NP
-figures NNS I-NP
-for IN B-PP
-September NNP B-NP
-, , O
-due JJ B-ADJP
-for IN B-PP
-release NN B-NP
-tomorrow NN B-NP
-, , O
-fail VB B-VP
-to TO I-VP
-show VB I-VP
-a DT B-NP
-substantial JJ I-NP
-improvement NN I-NP
-from IN B-PP
-July NNP B-NP
-and CC I-NP
-August NNP I-NP
-'s POS B-NP
-near-record JJ I-NP
-deficits NNS I-NP
-. . O
-
-Chancellor NNP O
-of IN B-PP
-the DT B-NP
-Exchequer NNP I-NP
-Nigel NNP B-NP
-Lawson NNP I-NP
-'s POS B-NP
-restated VBN I-NP
-commitment NN I-NP
-to TO B-PP
-a DT B-NP
-firm NN I-NP
-monetary JJ I-NP
-policy NN I-NP
-has VBZ B-VP
-helped VBN I-VP
-to TO I-VP
-prevent VB I-VP
-a DT B-NP
-freefall NN I-NP
-in IN B-PP
-sterling NN B-NP
-over IN B-PP
-the DT B-NP
-past JJ I-NP
-week NN I-NP
-. . O
-
-But CC O
-analysts NNS B-NP
-reckon VBP B-VP
-underlying VBG B-NP
-support NN I-NP
-for IN B-PP
-sterling NN B-NP
-has VBZ B-VP
-been VBN I-VP
-eroded VBN I-VP
-by IN B-PP
-the DT B-NP
-chancellor NN I-NP
-'s POS B-NP
-failure NN I-NP
-to TO B-VP
-announce VB I-VP
-any DT B-NP
-new JJ I-NP
-policy NN I-NP
-measures NNS I-NP
-in IN B-PP
-his PRP$ B-NP
-Mansion NNP I-NP
-House NNP I-NP
-speech NN I-NP
-last JJ B-NP
-Thursday NNP I-NP
-. . O
-
-This DT B-NP
-has VBZ B-VP
-increased VBN I-VP
-the DT B-NP
-risk NN I-NP
-of IN B-PP
-the DT B-NP
-government NN I-NP
-being VBG B-VP
-forced VBN I-VP
-to TO I-VP
-increase VB I-VP
-base NN B-NP
-rates NNS I-NP
-to TO B-PP
-16 CD B-NP
-% NN I-NP
-from IN B-PP
-their PRP$ B-NP
-current JJ I-NP
-15 CD I-NP
-% NN I-NP
-level NN I-NP
-to TO B-VP
-defend VB I-VP
-the DT B-NP
-pound NN I-NP
-, , O
-economists NNS B-NP
-and CC O
-foreign JJ B-NP
-exchange NN I-NP
-market NN I-NP
-analysts NNS I-NP
-say VBP B-VP
-. . O
-
-`` `` O
-The DT B-NP
-risks NNS I-NP
-for IN B-PP
-sterling NN B-NP
-of IN B-PP
-a DT B-NP
-bad JJ I-NP
-trade NN I-NP
-figure NN I-NP
-are VBP B-VP
-very RB B-ADVP
-heavily RB I-ADVP
-on IN B-PP
-the DT B-NP
-down JJ I-NP
-side NN I-NP
-, , O
-'' '' O
-said VBD B-VP
-Chris NNP B-NP
-Dillow NNP I-NP
-, , O
-senior JJ B-NP
-U.K. NNP I-NP
-economist NN I-NP
-at IN B-PP
-Nomura NNP B-NP
-Research NNP I-NP
-Institute NNP I-NP
-. . O
-
-`` `` O
-If IN B-SBAR
-there EX B-NP
-is VBZ B-VP
-another DT B-NP
-bad JJ I-NP
-trade NN I-NP
-number NN I-NP
-, , O
-there EX B-NP
-could MD B-VP
-be VB I-VP
-an DT B-NP
-awful JJ I-NP
-lot NN I-NP
-of IN B-PP
-pressure NN B-NP
-, , O
-'' '' O
-noted VBD B-VP
-Simon NNP B-NP
-Briscoe NNP I-NP
-, , O
-U.K. NNP B-NP
-economist NN I-NP
-for IN B-PP
-Midland NNP B-NP
-Montagu NNP I-NP
-, , O
-a DT B-NP
-unit NN I-NP
-of IN B-PP
-Midland NNP B-NP
-Bank NNP I-NP
-PLC NNP I-NP
-. . O
-
-Forecasts NNS B-NP
-for IN B-PP
-the DT B-NP
-trade NN I-NP
-figures NNS I-NP
-range VBP B-VP
-widely RB B-ADVP
-, , O
-but CC O
-few JJ B-NP
-economists NNS I-NP
-expect VBP B-VP
-the DT B-NP
-data NNS I-NP
-to TO B-VP
-show VB I-VP
-a DT B-NP
-very RB I-NP
-marked VBN I-NP
-improvement NN I-NP
-from IN B-PP
-the DT O
-# # O
-2 CD O
-billion CD O
-LRB- ( O
-$ $ B-ADJP
-3.2 CD O
-billion CD O
-RRB- ) O
-deficit NN B-NP
-in IN B-PP
-the DT B-NP
-current JJ I-NP
-account NN I-NP
-reported VBD B-VP
-for IN B-PP
-August NNP B-NP
-. . O
-
-The DT B-NP
-August NNP I-NP
-deficit NN I-NP
-and CC O
-the DT B-NP
-# # I-NP
-2.2 CD I-NP
-billion CD I-NP
-gap NN I-NP
-registered VBN B-VP
-in IN B-PP
-July NNP B-NP
-are VBP B-VP
-topped VBN I-VP
-only RB B-ADVP
-by IN B-PP
-the DT B-NP
-# # I-NP
-2.3 CD I-NP
-billion CD I-NP
-deficit NN I-NP
-of IN B-PP
-October NNP B-NP
-1988 CD I-NP
-. . O
-
-Sanjay NNP B-NP
-Joshi NNP I-NP
-, , O
-European JJ B-NP
-economist NN I-NP
-at IN B-PP
-Baring NNP B-NP
-Brothers NNPS I-NP
-& CC I-NP
-Co. NNP I-NP
-, , O
-said VBD B-VP
-there EX B-NP
-is VBZ B-VP
-no DT B-NP
-sign NN I-NP
-that IN B-SBAR
-Britain NNP B-NP
-'s POS B-NP
-manufacturing NN I-NP
-industry NN I-NP
-is VBZ B-VP
-transforming VBG I-VP
-itself PRP B-NP
-to TO B-VP
-boost VB I-VP
-exports NNS B-NP
-. . O
-
-At IN B-PP
-the DT B-NP
-same JJ I-NP
-time NN I-NP
-, , O
-he PRP B-NP
-remains VBZ B-VP
-fairly RB B-ADJP
-pessimistic JJ I-ADJP
-about IN B-PP
-the DT B-NP
-outlook NN I-NP
-for IN B-PP
-imports NNS B-NP
-, , O
-given VBN B-PP
-continued VBD B-NP
-high JJ I-NP
-consumer NN I-NP
-and CC I-NP
-capital NN I-NP
-goods NNS I-NP
-inflows NNS I-NP
-. . O
-
-He PRP B-NP
-reckons VBZ B-VP
-the DT B-NP
-current JJ I-NP
-account NN I-NP
-deficit NN I-NP
-will MD B-VP
-narrow VB I-VP
-to TO B-PP
-only RB B-NP
-# # I-NP
-1.8 CD I-NP
-billion CD I-NP
-in IN B-PP
-September NNP B-NP
-. . O
-
-However RB B-ADVP
-, , O
-Mr. NNP B-NP
-Dillow NNP I-NP
-said VBD B-VP
-he PRP B-NP
-believes VBZ B-VP
-that IN B-SBAR
-a DT B-NP
-reduction NN I-NP
-in IN B-PP
-raw JJ B-NP
-material NN I-NP
-stockbuilding VBG I-NP
-by IN B-PP
-industry NN B-NP
-could MD B-VP
-lead VB I-VP
-to TO B-PP
-a DT B-NP
-sharp JJ I-NP
-drop NN I-NP
-in IN B-PP
-imports NNS B-NP
-. . O
-
-Combined VBN B-PP
-with IN B-PP
-at IN B-ADVP
-least JJS I-ADVP
-some DT B-NP
-rebound NN I-NP
-in IN B-PP
-exports NNS B-NP
-after IN B-PP
-August NNP B-NP
-'s POS B-NP
-unexpected JJ I-NP
-decline NN I-NP
-, , O
-the DT B-NP
-deficit NN I-NP
-could MD B-VP
-narrow VB I-VP
-to TO B-PP
-as RB B-NP
-little JJ I-NP
-as IN I-NP
-# # I-NP
-1.3 CD I-NP
-billion CD I-NP
-. . O
-
-Mr. NNP B-NP
-Briscoe NNP I-NP
-, , O
-who WP B-NP
-also RB B-ADVP
-forecasts VBZ B-VP
-a DT B-NP
-# # I-NP
-1.3 CD I-NP
-billion CD I-NP
-current JJ I-NP
-account NN I-NP
-gap NN I-NP
-, , O
-warns VBZ B-VP
-that IN B-SBAR
-even RB B-SBAR
-if IN I-SBAR
-the DT B-NP
-trade NN I-NP
-figures NNS I-NP
-are VBP B-VP
-bullish JJ B-ADJP
-for IN B-PP
-sterling NN B-NP
-, , O
-the DT B-NP
-currency NN I-NP
-wo MD B-VP
-n't RB I-VP
-advance VB I-VP
-much JJ B-NP
-because IN B-SBAR
-investors NNS B-NP
-will MD B-VP
-want VB I-VP
-to TO I-VP
-see VB I-VP
-further JJ B-NP
-evidence NN I-NP
-of IN B-PP
-the DT B-NP
-turnaround NN I-NP
-before IN B-PP
-adjusting VBG B-VP
-positions NNS B-NP
-. . O
-
-Nevertheless RB B-ADVP
-, , O
-he PRP B-NP
-noted VBD B-VP
-, , O
-`` `` O
-No DT B-NP
-one PRP I-NP
-will MD B-VP
-want VB I-VP
-to TO I-VP
-go VB I-VP
-into IN B-PP
-the DT B-NP
-trade NN I-NP
-figures NNS I-NP
-without IN B-PP
-a DT B-NP
-flat JJ I-NP
-position NN I-NP
-'' '' O
-in IN B-PP
-the DT B-NP
-pound NN I-NP
-. . O
-
-Meanwhile RB B-ADVP
-, , O
-overall JJ B-NP
-evidence NN I-NP
-on IN B-PP
-the DT B-NP
-economy NN I-NP
-remains VBZ B-VP
-fairly RB B-ADJP
-clouded VBN I-ADJP
-. . O
-
-In IN B-PP
-his PRP$ B-NP
-Mansion NNP I-NP
-House NNP I-NP
-speech NN I-NP
-, , O
-Mr. NNP B-NP
-Lawson NNP I-NP
-warned VBD B-VP
-that IN B-SBAR
-a DT B-NP
-further JJ I-NP
-slowdown NN I-NP
-can MD B-VP
-be VB I-VP
-expected VBN I-VP
-as IN B-SBAR
-the DT B-NP
-impact NN I-NP
-of IN B-PP
-the DT B-NP
-last JJ I-NP
-rise NN I-NP
-in IN B-PP
-interest NN B-NP
-rates NNS I-NP
-earlier RBR B-NP
-this DT I-NP
-month NN I-NP
-takes VBZ B-VP
-effect NN B-NP
-. . O
-
-U.K. JJ B-NP
-base NN I-NP
-rates NNS I-NP
-are VBP B-VP
-at IN B-PP
-their PRP$ B-NP
-highest JJS I-NP
-level NN I-NP
-in IN B-PP
-eight CD B-NP
-years NNS I-NP
-. . O
-
-But CC O
-consumer NN B-NP
-expenditure NN I-NP
-data NNS I-NP
-released VBD B-VP
-Friday NNP B-NP
-do VBP B-VP
-n't RB I-VP
-suggest VB I-VP
-that IN B-SBAR
-the DT B-NP
-U.K. NNP I-NP
-economy NN I-NP
-is VBZ B-VP
-slowing VBG I-VP
-that DT B-ADVP
-quickly RB I-ADVP
-. . O
-
-The DT B-NP
-figures NNS I-NP
-show VBP B-VP
-that DT O
-spending NN B-NP
-rose VBD B-VP
-0.1 CD B-NP
-% NN I-NP
-in IN B-PP
-the DT B-NP
-third JJ I-NP
-quarter NN I-NP
-from IN B-PP
-the DT B-NP
-second JJ I-NP
-quarter NN I-NP
-and CC O
-was VBD B-VP
-up IN B-ADVP
-3.8 CD B-NP
-% NN I-NP
-from IN B-PP
-a DT B-NP
-year NN I-NP
-ago RB B-ADVP
-. . O
-
-This DT B-NP
-compares VBZ B-VP
-with IN B-PP
-a DT B-NP
-1.6 CD I-NP
-% NN I-NP
-rise NN I-NP
-in IN B-PP
-the DT B-NP
-second NN I-NP
-from IN B-PP
-the DT B-NP
-first JJ I-NP
-quarter NN I-NP
-and CC O
-a DT B-NP
-5.4 CD I-NP
-% NN I-NP
-increase NN I-NP
-from IN B-PP
-the DT B-NP
-second JJ I-NP
-quarter NN I-NP
-of IN B-PP
-1988 CD B-NP
-. . O
-
-Mr. NNP B-NP
-Dillow NNP I-NP
-said VBD B-VP
-the DT B-NP
-data NNS I-NP
-show VBP B-VP
-the DT B-NP
-economy NN I-NP
-`` `` O
-is VBZ B-VP
-still RB B-ADVP
-quite RB B-ADJP
-strong JJ I-ADJP
-, , O
-'' '' O
-but CC O
-suggestions NNS B-NP
-that IN B-SBAR
-much NN B-NP
-of IN B-PP
-the DT B-NP
-spending NN I-NP
-went VBD B-VP
-on IN B-PP
-services NNS B-NP
-rather RB B-PP
-than IN I-PP
-consumer NN B-NP
-goods NNS I-NP
-should MD B-VP
-reduce VB I-VP
-fears NNS B-NP
-of IN B-PP
-more JJR B-NP
-import NN I-NP
-rises NNS I-NP
-. . O
-
-Certainly RB B-ADVP
-, , O
-the DT B-NP
-chancellor NN I-NP
-has VBZ B-VP
-made VBN I-VP
-it PRP B-NP
-clear JJ B-ADJP
-that IN B-SBAR
-he PRP B-NP
-is VBZ B-VP
-prepared VBN I-VP
-to TO I-VP
-increase VB I-VP
-interest NN B-NP
-rates NNS I-NP
-again RB B-ADVP
-if IN B-SBAR
-necessary JJ B-ADJP
-to TO B-VP
-both DT I-VP
-ensure VB I-VP
-that IN B-SBAR
-a DT B-NP
-substantial JJ I-NP
-slowdown NN I-NP
-does VBZ B-VP
-take VB I-VP
-place NN B-NP
-and CC O
-that DT O
-sterling NN B-NP
-does VBZ B-VP
-n't RB I-VP
-decline VB I-VP
-further JJ B-ADVP
-. . O
-
-Thursday NNP B-NP
-, , O
-he PRP B-NP
-reminded VBD B-VP
-his PRP$ B-NP
-audience NN I-NP
-that IN B-SBAR
-the DT B-NP
-government NN I-NP
-`` `` O
-can MD B-VP
-not RB I-VP
-allow VB I-VP
-the DT B-NP
-necessary JJ I-NP
-rigor NN I-NP
-of IN B-PP
-monetary JJ B-NP
-policy NN I-NP
-to TO B-VP
-be VB I-VP
-undermined VBN I-VP
-by IN B-PP
-exchange NN B-NP
-rate NN I-NP
-weakness NN I-NP
-. . O
-'' '' O
-
-Analysts NNS B-NP
-agree VBP B-VP
-there EX B-NP
-is VBZ B-VP
-little JJ B-NP
-holding NN B-VP
-sterling NN B-NP
-firm NN B-ADJP
-at IN B-PP
-the DT B-NP
-moment NN I-NP
-other JJ B-ADJP
-than IN B-PP
-Mr. NNP B-NP
-Lawson NNP I-NP
-'s POS B-NP
-promise NN I-NP
-that IN B-SBAR
-rates NNS B-NP
-will MD B-VP
-be VB I-VP
-pushed VBN I-VP
-higher JJR B-ADJP
-if IN B-SBAR
-necessary JJ B-ADJP
-. . O
-
-And CC O
-, , O
-they PRP B-NP
-warn VBP B-VP
-, , O
-any DT B-NP
-further JJ I-NP
-drop NN I-NP
-in IN B-PP
-the DT B-NP
-government NN I-NP
-'s POS B-NP
-popularity NN I-NP
-could MD B-VP
-swiftly RB I-VP
-make VB I-VP
-this DT B-NP
-promise NN I-NP
-sound NN B-VP
-hollow JJ B-ADJP
-. . O
-
-Sterling NNP B-NP
-was VBD B-VP
-already RB I-VP
-showing VBG I-VP
-some DT B-NP
-signs NNS I-NP
-of IN B-PP
-a DT B-NP
-lack NN I-NP
-of IN B-PP
-confidence NN B-NP
-in IN B-PP
-Mr. NNP B-NP
-Lawson NNP I-NP
-'s POS B-NP
-promise NN I-NP
-Friday NNP B-NP
-. . O
-
-In IN B-PP
-European JJ B-NP
-trading NN I-NP
-it PRP B-NP
-declined VBD B-VP
-to TO B-PP
-$ $ B-NP
-1.5890 CD I-NP
-and CC O
-2.9495 CD B-NP
-marks NNS I-NP
-from IN B-PP
-$ $ B-NP
-1.5940 CD I-NP
-and CC O
-2.9429 CD B-NP
-marks NNS I-NP
-late JJ B-NP
-Thursday NNP I-NP
-. . O
-
-Economists NNS B-NP
-suggested VBD B-VP
-that IN B-SBAR
-if IN B-SBAR
-the DT B-NP
-pound NN I-NP
-falls VBZ B-VP
-much JJ B-NP
-below IN B-PP
-2.90 CD B-NP
-marks NNS I-NP
-, , O
-the DT B-NP
-government NN I-NP
-will MD B-VP
-be VB I-VP
-forced VBN I-VP
-to TO I-VP
-increase VB I-VP
-rates NNS B-NP
-to TO B-PP
-16 CD B-NP
-% NN I-NP
-, , O
-both DT B-VP
-to TO I-VP
-halt VB B-VP
-any DT B-NP
-further JJ I-NP
-decline NN I-NP
-and CC O
-ensure VB B-VP
-that IN B-SBAR
-the DT B-NP
-balance NN I-NP
-of IN B-PP
-monetary JJ B-NP
-policy NN I-NP
-remains VBZ B-VP
-unchanged JJ B-ADJP
-. . O
-
-Friday NNP B-NP
-'s POS B-NP
-Market NNP I-NP
-Activity NN I-NP
-
-The DT B-NP
-dollar NN I-NP
-posted VBD B-VP
-gains NNS B-NP
-in IN B-PP
-quiet JJ B-NP
-trading NN I-NP
-as IN B-SBAR
-concerns NNS B-NP
-about IN B-PP
-equities NNS B-NP
-abated VBN B-VP
-. . O
-
-Foreign JJ B-NP
-exchange NN I-NP
-dealers NNS I-NP
-said VBD B-VP
-that IN B-SBAR
-the DT B-NP
-currency NN I-NP
-market NN I-NP
-has VBZ B-VP
-begun VBN I-VP
-to TO I-VP
-distance VB I-VP
-itself PRP B-NP
-from IN B-PP
-the DT B-NP
-volatile JJ I-NP
-stock NN I-NP
-exchange NN I-NP
-, , O
-which WDT B-NP
-has VBZ B-VP
-preoccupied VBN I-VP
-the DT B-NP
-market NN I-NP
-since IN B-PP
-Oct. NNP B-NP
-13 CD I-NP
-, , O
-when WRB B-ADVP
-the DT B-NP
-Dow NNP I-NP
-Jones NNP I-NP
-Industrial NNP I-NP
-Average NNP I-NP
-plunged VBD B-VP
-more JJR B-NP
-than IN I-NP
-190 CD I-NP
-points NNS I-NP
-. . O
-
-Currency NN B-NP
-analysts NNS I-NP
-predict VBP B-VP
-that IN B-SBAR
-in IN B-PP
-the DT B-NP
-coming VBG I-NP
-week NN I-NP
-the DT B-NP
-foreign JJ I-NP
-exchange NN I-NP
-market NN I-NP
-will MD B-VP
-shift VB I-VP
-its PRP$ B-NP
-focus NN I-NP
-back RB B-ADVP
-to TO B-PP
-economic JJ B-NP
-fundamentals NNS I-NP
-, , O
-keeping VBG B-VP
-a DT B-NP
-close NN I-NP
-eye NN I-NP
-out IN B-ADVP
-for IN B-PP
-any DT B-NP
-signs NNS I-NP
-of IN B-PP
-monetary JJ B-NP
-easing NN I-NP
-by IN B-PP
-U.S. NNP B-NP
-Federal NNP I-NP
-Reserve NNP I-NP
-. . O
-
-Late RB B-ADVP
-in IN B-PP
-the DT B-NP
-New NNP I-NP
-York NNP I-NP
-trading NN I-NP
-day NN I-NP
-, , O
-the DT B-NP
-dollar NN I-NP
-was VBD B-VP
-quoted VBN I-VP
-at IN B-PP
-1.8578 CD B-NP
-marks NNS I-NP
-, , O
-up IN B-ADVP
-from IN B-PP
-1.8470 CD B-NP
-marks NNS I-NP
-late JJ B-NP
-Thursday NNP I-NP
-in IN B-PP
-New NNP B-NP
-York NNP I-NP
-. . O
-
-The DT B-NP
-U.S. NNP I-NP
-currency NN I-NP
-was VBD B-VP
-also RB I-VP
-changing VBG I-VP
-hands NNS B-NP
-at IN B-PP
-142.43 CD B-NP
-yen NN I-NP
-, , O
-up IN B-ADVP
-from IN B-PP
-141.70 CD B-NP
-yen NN I-NP
-in IN B-PP
-New NNP B-NP
-York NNP I-NP
-late JJ B-NP
-Thursday NNP I-NP
-. . O
-
-In IN B-PP
-Tokyo NNP B-NP
-on IN B-PP
-Monday NNP B-NP
-, , O
-the DT B-NP
-U.S. NNP I-NP
-currency NN I-NP
-opened VBD B-VP
-for IN B-PP
-trading NN B-NP
-at IN B-PP
-141.95 CD B-NP
-yen NN I-NP
-, , O
-up IN B-ADVP
-from IN B-PP
-Friday NNP B-NP
-'s POS B-NP
-Tokyo NNP I-NP
--- a/paddle/trainer/tests/test_Trainer.cpp
+++ b/paddle/trainer/tests/test_Trainer.cpp
@@ -24,7 +24,6 @@ using namespace std;     // NOLINT
 static const string& configFile1 = "trainer/tests/sample_trainer_config.conf";
 static const string& configFile2 =
    "trainer/tests/sample_trainer_config_hsigmoid.conf";
-static const string& configFile3 = "trainer/tests/chunking.conf";
 static const string& configFile4 =
    "trainer/tests/sample_trainer_config_parallel.conf";

@@ -95,13 +94,6 @@ TEST(checkGradient, multi) {

 TEST(checkGradient, hsigmoid) { checkGradientTest(configFile2, false, false); }

-TEST(checkGradient, chunk) {
-  checkGradientTest(configFile3, false, false);
-#ifdef PADDLE_WITH_CUDA
-  checkGradientTest(configFile3, true, true);
-#endif
-}
-
 TEST(checkGradient, non_parallel) {
  checkGradientTest(configFile4, false, false);
 }

--- a/paddle/trainer/tests/test_config.conf
+++ b/paddle/trainer/tests/test_config.conf
@@ -15,12 +15,7 @@

 from paddle.trainer_config_helpers import *

-TrainData(ProtoData(
-    files = "dummy_list",
-    constant_slots = [1.0],
-    async_load_data = True))
-
-TestData(SimpleData(
+TrainData(SimpleData(
    files = "trainer/tests/sample_filelist.txt",
    feat_dim = 3,
    context_len = 0,

--- a/paddle/trainer/tests/test_files.txt
+++ b/paddle/trainer/tests/test_files.txt
-trainer/tests/test_proto.bin
--- a/paddle/trainer/tests/train.list
+++ b/paddle/trainer/tests/train.list
-trainer/tests/data_bin_part
--- a/paddle/trainer/tests/train.txt
+++ b/paddle/trainer/tests/train.txt
--- a/paddle/trainer/tests/train_files.txt
+++ b/paddle/trainer/tests/train_files.txt
-trainer/tests/train_proto.bin
--- a/paddle/trainer/tests/train_sparse.list
+++ b/paddle/trainer/tests/train_sparse.list
-trainer/tests/compare_sparse_data
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -1116,35 +1116,6 @@ def PyData(files=None,
    return data_config


-@config_func
-def ProtoData(files=None,
-              type=None,
-              file_group_queue_capacity=None,
-              load_file_count=None,
-              constant_slots=None,
-              load_thread_num=None,
-              **xargs):
-    data_config = create_data_config_proto(**xargs)
-    if type is None:
-        data_config.type = 'proto'
-    else:
-        data_config.type = type
-    data_config.files = files
-
-    # When type="proto_group", one data provider contains at most
-    # load_file_count files, and there are at most
-    # (queue_capacity + load_thread_num + 1) data providers in memory
-    if file_group_queue_capacity is not None:
-        data_config.file_group_conf.queue_capacity = file_group_queue_capacity
-    if load_file_count is not None:
-        data_config.file_group_conf.load_file_count = load_file_count
-    if load_thread_num is not None:
-        data_config.file_group_conf.load_thread_num = load_thread_num
-    if constant_slots:
-        data_config.constant_slots.extend(constant_slots)
-    return data_config
-
-
 #real data for training is actually provided by "sub_data" data providers.
 @config_func
 def MultiData(sub_data=[]):
@@ -2066,13 +2037,20 @@ class ParameterReluLayer(LayerBase):
    def __init__(self, name, inputs, partial_sum=1, **args):
        super(ParameterReluLayer, self).__init__(
            name, self.layer_type, 0, inputs=inputs, **args)
+
        input_layer = self.get_input_layer(0)
        config_assert(len(self.inputs) == 1, "prelu layer has only one input.")
        config_assert(input_layer.size % partial_sum == 0,
                      "a wrong setting for partial_sum")
+
+        dims = [1, input_layer.size / partial_sum]
        self.set_layer_size(input_layer.size)
        self.config.partial_sum = partial_sum
-        self.create_input_parameter(0, input_layer.size / partial_sum)
+        self.create_input_parameter(0, input_layer.size / partial_sum, dims)
+
+        self.set_layer_height_width(self.get_input_layer(0).height, \
+                                        self.get_input_layer(0).width)
+        self.set_layer_depth(self.get_input_layer(0).depth)


 @config_layer('conv')
@@ -2714,7 +2692,7 @@ Usage:
             max_sort_size = -1, inputs = ["output", "score"])

  Input data: Samples of the same query should be loaded as a sequence,
-          by ProtoDataProvider or PyDataProvider etc.. User should provide
+          by PyDataProvider etc.. User should provide
          scores for each sample. The score slot should be the 2nd
          input of lambdaRank layer.


--- a/python/paddle/trainer_config_helpers/activations.py
+++ b/python/paddle/trainer_config_helpers/activations.py
@@ -17,7 +17,8 @@ __all__ = [
    "IdentityActivation", "LinearActivation", 'SequenceSoftmaxActivation',
    'ExpActivation', "ReluActivation", "BReluActivation", "SoftReluActivation",
    "STanhActivation", "AbsActivation", "SquareActivation", "BaseActivation",
-    "LogActivation", "SqrtActivation", "ReciprocalActivation"
+    "LogActivation", "SqrtActivation", "ReciprocalActivation",
+    "SoftSignActivation"
 ]


@@ -243,8 +244,20 @@ class ReciprocalActivation(BaseActivation):
    Reciprocal Activation.

    .. math::
-       f(z) = 1/z
+       f(z)=\\frac{1}{z}
    """

    def __init__(self):
        BaseActivation.__init__(self, 'reciprocal', False)
+
+
+class SoftSignActivation(BaseActivation):
+    """
+    SoftSign Activation.
+
+    .. math::
+       f(z)=\\frac{z}{1 + |z|}
+    """
+
+    def __init__(self):
+        BaseActivation.__init__(self, 'softsign', False)
--- a/python/paddle/trainer_config_helpers/evaluators.py
+++ b/python/paddle/trainer_config_helpers/evaluators.py
@@ -297,7 +297,7 @@ def auc_evaluator(
 def pnpair_evaluator(
        input,
        label,
-        info,
+        query_id,
        weight=None,
        name=None, ):
    """
@@ -308,16 +308,20 @@ def pnpair_evaluator(

    .. code-block:: python

-       eval = pnpair_evaluator(input, label, info)
+       eval = pnpair_evaluator(input, label, query_id)

    :param input: Input Layer name. The output prediction of network.
    :type input: LayerOutput
    :param label: Label layer name.
    :type label: LayerOutput
-    :param info: Info layer name. (TODO, explaination)
-    :type info: LayerOutput
+    :param query_id: Query_id layer name. Query_id indicates that which query
+     each sample belongs to. Its shape should be
+     the same as output of Label layer.
+    :type query_id: LayerOutput
    :param weight: Weight Layer name. It should be a matrix with size
-                  [sample_num, 1]. (TODO, explaination)
+                  [sample_num, 1] which indicates the weight of each sample.
+                  The default weight of sample is 1 if the weight layer is None.
+                  And the pair weight is the mean of the two samples' weight.
    :type weight: LayerOutput
    :param name: Evaluator name.
    :type name: None|basestring
@@ -326,8 +330,8 @@ def pnpair_evaluator(
        input = [input]
    if label:
        input.append(label)
-    if info:
-        input.append(info)
+    if query_id:
+        input.append(query_id)
    evaluator_base(
        input=input,
        type="pnpair",

--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -2507,12 +2507,12 @@ def img_conv_layer(input,
    input is raw pixels of image(mono or RGB), or it may be the previous layer's
    num_filters * num_group.

-    There are several group of filter in PaddlePaddle implementation.
-    Each group will process some channel of the inputs. For example, if an input
+    There are several groups of filters in PaddlePaddle implementation.
+    Each group will process some channels of the input. For example, if
    num_channel = 256, group = 4, num_filter=32, the PaddlePaddle will create
-    32*4 = 128 filters to process inputs. The channels will be split into 4
-    pieces. First 256/4 = 64 channels will process by first 32 filters. The
-    rest channels will be processed by rest group of filters.
+    32*4 = 128 filters to process the input. The channels will be split into 4
+    pieces. First 256/4 = 64 channels will be processed by first 32 filters. The
+    rest channels will be processed by the rest groups of filters.

    The example usage is:

@@ -2528,53 +2528,68 @@ def img_conv_layer(input,
    :type name: basestring
    :param input: The input of this layer.
    :type input: LayerOutput
-    :param filter_size: The x dimension of a filter kernel. Or input a tuple for
-                        two image dimension.
+    :param filter_size: The dimensions of the filter kernel. If the parameter is
+                        set to one integer, the two dimensions on x and y axises
+                        will be same when filter_size_y is not set. If it is set
+                        to a list, the first element indicates the dimension on
+                        the x axis, and the second is used to specify the dimension
+                        on the y axis when filter_size_y is not provided.
    :type filter_size: int | tuple | list
-    :param filter_size_y: The y dimension of a filter kernel. Since PaddlePaddle
-                        currently supports rectangular filters, the filter's
-                        shape will be (filter_size, filter_size_y).
-    :type filter_size_y: int | None
+    :param filter_size_y: The dimension of the filter kernel on the y axis. If the parameter
+                          is not set, it will be set automatically according to filter_size.
+    :type filter_size_y: int
    :param num_filters: Each filter group's number of filter
    :param act: Activation type. ReluActivation is the default activation.
    :type act: BaseActivation
-    :param groups: Group size of filters.
+    :param groups: The group number. 1 is the default group number.
    :type groups: int
-    :param stride: The x dimension of the stride. Or input a tuple for two image
-                   dimension.
+    :param stride: The strides. If the parameter is set to one integer, the strides
+                   on x and y axises will be same when stride_y is not set. If it is
+                   set to a list, the first element indicates the stride on the x axis,
+                   and the second is used to specify the stride on the y axis when
+                   stride_y is not provided. 1 is the default value.
    :type stride: int | tuple | list
-    :param stride_y: The y dimension of the stride.
+    :param stride_y: The stride on the y axis.
    :type stride_y: int
-    :param padding: The x dimension of the padding. Or input a tuple for two
-                    image dimension
+    :param padding: The padding sizes. If the parameter is set to one integer, the padding
+                    sizes on x and y axises will be same when padding_y is not set. If it
+                    is set to a list, the first element indicates the padding size on the
+                    x axis, and the second is used to specify the padding size on the y axis
+                    when padding_y is not provided. 0 is the default padding size.
    :type padding: int | tuple | list
-    :param padding_y: The y dimension of the padding.
+    :param padding_y: The padding size on the y axis.
    :type padding_y: int
-    :param dilation: The x dimension of the dilation. Or input a tuple for two
-                    image dimension
+    :param dilation: The dimensions of the dilation. If the parameter is set to one integer,
+                     the two dimensions on x and y axises will be same when dilation_y is not
+                     set. If it is set to a list, the first element indicates the dimension
+                     on the x axis, and the second is used to specify the dimension on the y
+                     axis when dilation_y is not provided. 1 is the default dimension.
    :type dilation: int | tuple | list
-    :param dilation_y: The y dimension of the dilation.
+    :param dilation_y: The dimension of the dilation on the y axis.
    :type dilation_y: int
    :param bias_attr: The bias attribute. If the parameter is set to False or an object
                      whose type is not ParameterAttribute, no bias is defined. If the
                      parameter is set to True, the bias is initialized to zero.
    :type bias_attr: ParameterAttribute | None | bool | Any
-    :param num_channels: number of input channels. If None will be set
-                        automatically from previous output.
+    :param num_channels: The number of input channels. If the parameter is not set or
+                         set to None, its actual value will be automatically set to
+                         the channel number of the input.
    :type num_channels: int
-    :param param_attr: Convolution param attribute. None means default attribute
+    :param param_attr: The parameter attribute. See ParameterAttribute for
+                       details.
    :type param_attr: ParameterAttribute
-    :param shared_biases: Is biases will be shared between filters or not.
+    :param shared_biases: Whether biases will be shared between filters or not.
    :type shared_biases: bool
-    :param layer_attr: Layer Extra Attribute.
+    :param layer_attr: The extra layer attributes. See ExtraLayerAttribute for
+                       details.
    :type layer_attr: ExtraLayerAttribute
-    :param trans: true if it is a convTransLayer, false if it is a convLayer
+    :param trans: True if it is a convTransLayer, False if it is a convLayer
    :type trans: bool
-    :param layer_type: specify the layer_type, default is None. If trans=True,
-                       layer_type has to be "exconvt" or "cudnn_convt",
-                       otherwise layer_type has to be either "exconv" or
-                       "cudnn_conv"
-    :type layer_type: String
+    :param layer_type: Specify the layer type. If the dilation's dimension on one axis is
+                       larger than 1, layer_type has to be "cudnn_conv" or "cudnn_convt".
+                       If trans=True, layer_type has to be "exconvt" or "cudnn_convt",
+                       otherwise layer_type has to be either "exconv" or "cudnn_conv".
+    :type layer_type: basestring
    :return: LayerOutput object.
    :rtype: LayerOutput
    """
@@ -2679,7 +2694,7 @@ def img_pool_layer(input,
    """
    Image pooling Layer.

-    The details of pooling layer, please refer ufldl's pooling_ .
+    The details of pooling layer, please refer to ufldl's pooling_ .

    .. _pooling: http://ufldl.stanford.edu/tutorial/supervised/Pooling/

@@ -2711,32 +2726,37 @@ def img_pool_layer(input,
                                 padding_y=2,
                                 pool_type=MaxPooling())

-    :param padding: pooling padding width.
+    :param padding: The padding size on the x axis. 0 is the default padding size.
    :type padding: int
-    :param padding_y: pooling padding height. It's equal to padding by default.
-    :type padding_y: int | None
-    :param name: name of pooling layer
-    :type name: basestring.
+    :param padding_y: The padding size on the y axis. If the parameter is not set
+                      or set to None, it will be set to 'padding' automatically.
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
    :param input: The input of this layer.
    :type input: LayerOutput
-    :param pool_size: pooling window width
+    :param pool_size: The pooling window length on the x axis.
    :type pool_size: int
-    :param pool_size_y: pooling window height. It's eaqual to pool_size by default.
-    :type pool_size_y: int | None
-    :param num_channels: number of input channel.
+    :param pool_size_y: The pooling window length on the y axis. If the parameter is
+                        not set or set to None, its actual value will be automatically
+                        set to pool_size.
+    :type pool_size_y: int
+    :param num_channels: The number of input channels. If the parameter is not set or
+                         set to None, its actual value will be automatically set to
+                         the channel number of the input.
    :type num_channels: int
-    :param pool_type: pooling type. MaxPooling or AvgPooling. Default is
-                      MaxPooling.
+    :param pool_type: Pooling type. MaxPooling is the default pooling.
    :type pool_type: BasePoolingType
-    :param stride: stride width of pooling.
+    :param stride: The stride on the x axis. 1 is the default value.
    :type stride: int
-    :param stride_y: stride height of pooling. It is equal to stride by default.
-    :type stride_y: int | None
-    :param layer_attr: Extra Layer attribute.
+    :param stride_y: The stride on the y axis. If the parameter is not set or set to
+                     None, its actual value will be automatically set to 'stride'.
+    :type stride_y: int
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
    :type layer_attr: ExtraLayerAttribute
-    :param ceil_mode: Wether to use ceil mode to calculate output height and with.
-                      Defalut is True. If set false, Otherwise use floor.
-
+    :param ceil_mode: Whether to use the ceil function to calculate output height and width.
+                      True is the default. If it is set to False, the floor function will
+                      be used.
    :type ceil_mode: bool
    :return: LayerOutput object.
    :rtype: LayerOutput
@@ -2842,24 +2862,32 @@ def img_pool3d_layer(input,

    :param padding: pooling padding width.
    :type padding: int | tuple | list
-    :param name: name of pooling layer
+    :param name: The name of this layer. It is optional.
    :type name: basestring.
    :param input: The input of this layer.
    :type input: LayerOutput
-    :param pool_size: pooling window width
+    :param pool_size: The pooling window lengths along three axes. If the parameter
+                      is set to one integer, the three lengths will be the same.
    :type pool_size: int | tuple | list
-    :param num_channels: number of input channel.
+    :param num_channels: The number of input channels. If the parameter is not set or
+                         set to None, its actual value will be automatically set to
+                         the channel number of the input.
    :type num_channels: int
-    :param pool_type: pooling type. MaxPooling or AvgPooling. Default is
-                      MaxPooling.
+    :param pool_type: Pooling type. MaxPooling is the default pooling.
    :type pool_type: BasePoolingType
-    :param stride: stride width of pooling.
+    :param stride: The strides of the pooling along three axes. If the parameter
+                   is set to one integer, the three strides will be the same. 1 is the
+                   default value.
    :type stride: int | tuple | list
-    :param layer_attr: Extra Layer attribute.
+    :param padding: The sizes of padding along three axes. If the parameter is set to
+                    one integer, they will be the same. 0 is the default padding size.
+    :type padding: int | tuple | list
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
    :type layer_attr: ExtraLayerAttribute
-    :param ceil_mode: Wether to use ceil mode to calculate output height and with.
-                      Defalut is True. If set false, Otherwise use floor.
-
+    :param ceil_mode: Whether to use the ceil function to calculate output height and width.
+                      True is the default. If it is set to False, the floor function will
+                      be used.
    :type ceil_mode: bool
    :return: LayerOutput object.
    :rtype: LayerOutput
@@ -2938,9 +2966,11 @@ def spp_layer(input,
              pyramid_height=None,
              layer_attr=None):
    """
-    Spatial Pyramid Pooling in Deep Convolutional Networks for Visual Recognition.
-    The details please refer to
-    `Kaiming He's paper <https://arxiv.org/abs/1406.4729>`_.
+    A layer that performs spatial pyramid pooling.
+
+    Reference:
+        Spatial Pyramid Pooling in Deep Convolutional Networks for Visual Recognition
+        https://arxiv.org/abs/1406.4729

    The example usage is:

@@ -2955,13 +2985,16 @@ def spp_layer(input,
    :type name: basestring
    :param input: The input of this layer.
    :type input: LayerOutput
-    :param num_channels: number of input channel.
+    :param num_channels: The number of input channels. If the parameter is not set or
+                         set to None, its actual value will be automatically set to
+                         the channel number of the input.
    :type num_channels: int
-    :param pool_type: Pooling type. MaxPooling or AveragePooling. Default is MaxPooling.
+    :param pool_type: Pooling type. MaxPooling is the default pooling.
    :type scale: BasePoolingType
-    :param pyramid_height: pyramid height.
+    :param pyramid_height: The pyramid height of this pooling.
    :type pyramid_height: int
-    :param layer_attr: Extra Layer Attribute.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
    :type layer_attr: ExtraLayerAttribute
    :return: LayerOutput object.
    :rtype: LayerOutput
@@ -4694,7 +4727,7 @@ def conv_projection(input,
                        will be same when filter_size_y is not set. If it is set
                        to a list, the first element indicates the dimension on
                        the x axis, and the second is used to specify the dimension
-                        on the y axis when filter_size is not provided.
+                        on the y axis when filter_size_y is not provided.
    :type filter_size: int | tuple | list
    :param filter_size_y: The dimension of the filter kernel on the y axis. If the parameter
                          is not set, it will be set automatically according to filter_size.
@@ -6571,10 +6604,11 @@ def row_conv_layer(input,

 @layer_support()
 @wrap_name_default()
-@wrap_param_attr_default()
 def prelu_layer(input,
                name=None,
                partial_sum=1,
+                channel_shared=None,
+                num_channels=None,
                param_attr=None,
                layer_attr=None):
    """
@@ -6605,6 +6639,14 @@ def prelu_layer(input,
        - partial_sum = number of outputs, indicates all elements share the same weight.

    :type partial_sum: int
+    :param channel_shared: Whether or not the parameters are shared across channels.
+
+        - channel_shared = True, we set the partial_sum to the number of outputs.
+        - channel_shared = False, we set the partial_sum to the number of elements in one channel.
+
+    :type channel_shared: bool
+    :param num_channels: The number of input channels. If None, input.num_filters is used.
+    :type num_channels: int
    :param param_attr: The parameter attribute. See ParameterAttribute for details.
    :type param_attr: ParameterAttribute
    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
@@ -6615,7 +6657,25 @@ def prelu_layer(input,
    """

    assert isinstance(input, LayerOutput), 'prelu_layer accepts only one input.'
-    assert isinstance(param_attr, ParameterAttribute)
+
+    if not param_attr:
+        param_attr = ParamAttr(initial_mean=0.25, initial_std=0.0)
+    else:
+        assert isinstance(param_attr, ParameterAttribute)
+
+    if num_channels is None:
+        assert input.num_filters is not None, \
+                'the input channel cannot be detected, please specify the num_channels parameter'
+        num_channels = input.num_filters
+
+    if channel_shared is not None:
+        assert isinstance(channel_shared, bool)
+        assert (input.height != 0 and input.width != 0), \
+            'input height and widht must be setted'
+        if channel_shared:
+            partial_sum = input.height * input.width * num_channels
+        else:
+            partial_sum = input.height * input.width

    l = Layer(
        name=name,
@@ -6627,6 +6687,7 @@ def prelu_layer(input,
        name=name,
        layer_type=LayerType.PRELU,
        parents=input,
+        num_filters=num_channels,
        size=l.config.size)


@@ -7076,7 +7137,7 @@ def img_conv3d_layer(input,
    :type layer_attr: ExtraLayerAttribute
    :param trans: True if it is a convTransLayer, False if it is a convLayer
    :type trans: bool
-    :param layer_type: Specify the layer_type. If the parameter is set, it must be "deconv3d"
+    :param layer_type: Specify the layer type. If the parameter is set, it must be "deconv3d"
                       when trans=True. If not set, it will be automatically set to "deconv3d"
                       when trans=True and "conv3d" when trans=False.
    :type layer_type: basestring

--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_prelu_layer.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_prelu_layer.protostr
@@ -4,6 +4,8 @@ layers {
  type: "data"
  size: 300
  active_type: ""
+  height: 10
+  width: 10
 }
 layers {
  name: "__prelu_layer_0__"
@@ -15,6 +17,9 @@ layers {
    input_parameter_name: "___prelu_layer_0__.w0"
  }
  partial_sum: 1
+  height: 10
+  width: 10
+  depth: 1
 }
 layers {
  name: "__prelu_layer_1__"
@@ -26,6 +31,9 @@ layers {
    input_parameter_name: "___prelu_layer_1__.w0"
  }
  partial_sum: 1
+  height: 10
+  width: 10
+  depth: 1
 }
 layers {
  name: "__prelu_layer_2__"
@@ -37,41 +45,100 @@ layers {
    input_parameter_name: "___prelu_layer_2__.w0"
  }
  partial_sum: 5
+  height: 10
+  width: 10
+  depth: 1
+}
+layers {
+  name: "__prelu_layer_3__"
+  type: "prelu"
+  size: 300
+  active_type: ""
+  inputs {
+    input_layer_name: "input"
+    input_parameter_name: "___prelu_layer_3__.w0"
+  }
+  partial_sum: 300
+  height: 10
+  width: 10
+  depth: 1
+}
+layers {
+  name: "__prelu_layer_4__"
+  type: "prelu"
+  size: 300
+  active_type: ""
+  inputs {
+    input_layer_name: "input"
+    input_parameter_name: "___prelu_layer_4__.w0"
+  }
+  partial_sum: 100
+  height: 10
+  width: 10
+  depth: 1
 }
 parameters {
  name: "___prelu_layer_0__.w0"
  size: 300
-  initial_mean: 0.0
-  initial_std: 0.057735026919
+  initial_mean: 0.25
+  initial_std: 0.0
+  dims: 1
+  dims: 300
  initial_strategy: 0
-  initial_smart: true
+  initial_smart: false
 }
 parameters {
  name: "___prelu_layer_1__.w0"
  size: 300
-  initial_mean: 0.0
-  initial_std: 0.057735026919
+  initial_mean: 0.25
+  initial_std: 0.0
+  dims: 1
+  dims: 300
  initial_strategy: 0
-  initial_smart: true
+  initial_smart: false
 }
 parameters {
  name: "___prelu_layer_2__.w0"
  size: 60
-  initial_mean: 0.0
-  initial_std: 0.129099444874
+  initial_mean: 0.25
+  initial_std: 0.0
+  dims: 1
+  dims: 60
+  initial_strategy: 0
+  initial_smart: false
+}
+parameters {
+  name: "___prelu_layer_3__.w0"
+  size: 1
+  initial_mean: 0.25
+  initial_std: 0.0
+  dims: 1
+  dims: 1
+  initial_strategy: 0
+  initial_smart: false
+}
+parameters {
+  name: "___prelu_layer_4__.w0"
+  size: 3
+  initial_mean: 0.25
+  initial_std: 0.0
+  dims: 1
+  dims: 3
  initial_strategy: 0
-  initial_smart: true
+  initial_smart: false
 }
 input_layer_names: "input"
-output_layer_names: "__prelu_layer_2__"
+output_layer_names: "__prelu_layer_4__"
 sub_models {
  name: "root"
  layer_names: "input"
  layer_names: "__prelu_layer_0__"
  layer_names: "__prelu_layer_1__"
  layer_names: "__prelu_layer_2__"
+  layer_names: "__prelu_layer_3__"
+  layer_names: "__prelu_layer_4__"
  input_layer_names: "input"
-  output_layer_names: "__prelu_layer_2__"
+  output_layer_names: "__prelu_layer_4__"
  is_recurrent_layer_group: false
 }

--- a/python/paddle/trainer_config_helpers/tests/configs/test_prelu_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_prelu_layer.py
 from paddle.trainer_config_helpers import *

-data = data_layer(name='input', size=300)
-prelu = prelu_layer(input=data)
-prelu = prelu_layer(input=data, partial_sum=1)
-prelu = prelu_layer(input=data, partial_sum=5)
+data = data_layer(name='input', size=300, height=10, width=10)
+prelu = prelu_layer(input=data, num_channels=3)
+prelu = prelu_layer(input=data, partial_sum=1, num_channels=3)
+prelu = prelu_layer(input=data, partial_sum=5, num_channels=3)
+prelu = prelu_layer(input=data, channel_shared=True, num_channels=3)
+prelu = prelu_layer(input=data, channel_shared=False, num_channels=3)

 outputs(prelu)
--- a/python/paddle/v2/__init__.py
+++ b/python/paddle/v2/__init__.py
@@ -62,21 +62,15 @@ __all__ = [
 cp.begin_parse()


-def init(**kwargs):
-    import py_paddle.swig_paddle as api
-    args = []
-    args_dict = {}
-    # NOTE: append arguments if they are in ENV
-    for ek, ev in os.environ.iteritems():
-        if ek.startswith("PADDLE_INIT_"):
-            args_dict[ek.replace("PADDLE_INIT_", "").lower()] = str(ev)
+def set_omp_mkl_env_vars(trainer_count):
+    '''Automatically set CPU environment variables that have not been set before.
+       Export KMP_AFFINITY, OMP_DYNAMIC according to the Hyper Threading status.
+       Export OMP_NUM_THREADS, MKL_NUM_THREADS according to trainer_count.
+    '''
+    import platform
+    if not platform.system() in ['Linux', 'Darwin']:
+        return

-    args_dict.update(kwargs)
-    # NOTE: overwrite arguments from ENV if it is in kwargs
-    for key in args_dict.keys():
-        args.append('--%s=%s' % (key, str(args_dict[key])))
-
-    # auto set cpu environment
    def set_env(key, value):
        '''If the key has not been set in the environment, set it with value.'''
        assert isinstance(key, str)
@@ -85,22 +79,59 @@ def init(**kwargs):
        if envset is None:
            os.environ[key] = value

-    ht = os.popen("lscpu |grep \"per core\"|awk -F':' '{print $2}'|xargs")
-    ht = int(ht.read())
-    if ht == 1:  # ht is off
-        set_env("OMP_DYNAMIC", "false")
-        set_env("KMP_AFFINITY", "granularity=fine,compact,0,0")
-    else:
+    def num_physical_cores():
+        '''Get the number of physical cores'''
+        if platform.system() == "Linux":
+            num_sockets = int(
+                os.popen("lscpu |grep \"Socket\" |awk -F':' '{print $2}'|xargs")
+                .read())
+            num_cores_per_socket = int(
+                os.popen(
+                    "lscpu |grep \"per socket\" |awk -F':' '{print $2}'|xargs")
+                .read())
+            return num_sockets * num_cores_per_socket
+        else:
+            cmds = {"Darwin": "sysctl hw.physicalcpu"}
+            return int(os.popen(cmds.get(platform.system(), "expr 1")).read())
+
+    def num_logical_processors():
+        '''Get the number of logical processors'''
+        cmds = {
+            "Linux": "grep \"processor\" /proc/cpuinfo|sort -u|wc -l",
+            "Darwin": "sysctl hw.logicalcpu"
+        }
+        return int(os.popen(cmds.get(platform.system(), "expr 1")).read())
+
+    num_cores = num_physical_cores()
+    num_processors = num_logical_processors()
+    if num_processors > num_cores:  # Hyper Threading is enabled
        set_env("OMP_DYNAMIC", "true")
        set_env("KMP_AFFINITY", "granularity=fine,compact,1,0")
-    processors = os.popen("grep \"processor\" /proc/cpuinfo|sort -u|wc -l")
-    processors = int(processors.read())
-    trainers = kwargs.get('trainer_count', 1)
-    threads = processors / trainers
+    else:
+        set_env("OMP_DYNAMIC", "false")
+        set_env("KMP_AFFINITY", "granularity=fine,compact,0,0")
+    threads = num_processors / trainer_count
    threads = '1' if threads < 1 else str(threads)
    set_env("OMP_NUM_THREADS", threads)
    set_env("MKL_NUM_THREADS", threads)

+
+def init(**kwargs):
+    import py_paddle.swig_paddle as api
+    args = []
+    args_dict = {}
+    # NOTE: append arguments if they are in ENV
+    for ek, ev in os.environ.iteritems():
+        if ek.startswith("PADDLE_INIT_"):
+            args_dict[ek.replace("PADDLE_INIT_", "").lower()] = str(ev)
+
+    args_dict.update(kwargs)
+    # NOTE: overwrite arguments from ENV if it is in kwargs
+    for key in args_dict.keys():
+        args.append('--%s=%s' % (key, str(args_dict[key])))
+
+    set_omp_mkl_env_vars(kwargs.get('trainer_count', 1))
+
    if 'use_gpu' in kwargs:
        cp.g_command_config_args['use_gpu'] = kwargs['use_gpu']
    if 'use_mkldnn' in kwargs:

--- a/python/paddle/v2/fluid/framework.py
+++ b/python/paddle/v2/fluid/framework.py
@@ -15,6 +15,37 @@ def unique_name(prefix):
    return "_".join([prefix, str(uid)])


+def convert_np_dtype_to_dtype_(np_dtype):
+    dtype = np.dtype(np_dtype)
+    if dtype == np.float32:
+        return core.DataType.FP32
+    elif dtype == np.float64:
+        return core.DataType.FP64
+    elif dtype == np.float16:
+        return core.DataType.FP16
+    elif dtype == np.int32:
+        return core.DataType.INT32
+    elif dtype == np.int16:
+        return core.DataType.INT16
+    elif dtype == np.int64:
+        return core.DataType.INT64
+    elif dtype == np.bool:
+        return core.DataType.BOOL
+    else:
+        raise ValueError("Not supported numpy dtype " + str(dtype))
+
+
+def dtype_is_floating(dtype):
+    if not isinstance(dtype, core.DataType):
+        dtype = convert_np_dtype_to_dtype_(dtype)
+
+    if (dtype == core.DataType.FP16 or dtype == core.DataType.FP32 or
+            dtype == core.DataType.FP64):
+        return True
+    else:
+        return False
+
+
 def _debug_string_(proto, throw_on_error=True):
    error_fields = list()
    if not proto.IsInitialized(error_fields) and throw_on_error:
@@ -66,7 +97,7 @@ class Variable(object):
                        "matched.".format(self.name, old_shape, shape))
        if dtype is not None:
            if not isinstance(dtype, core.DataType):
-                dtype = Variable._convert_np_dtype_to_dtype_(dtype)
+                dtype = convert_np_dtype_to_dtype_(dtype)
            if is_new_var:
                self.desc.set_data_type(dtype)
            else:
@@ -148,26 +179,6 @@ class Variable(object):
        uid = core.unique_integer(prefix)  # unique during whole process.
        return "_".join([prefix, str(uid)])

-    @staticmethod
-    def _convert_np_dtype_to_dtype_(np_dtype):
-        dtype = np.dtype(np_dtype)
-        if dtype == np.float32:
-            return core.DataType.FP32
-        elif dtype == np.float64:
-            return core.DataType.FP64
-        elif dtype == np.float16:
-            return core.DataType.FP16
-        elif dtype == np.int32:
-            return core.DataType.INT32
-        elif dtype == np.int16:
-            return core.DataType.INT16
-        elif dtype == np.int64:
-            return core.DataType.INT64
-        elif dtype == np.bool:
-            return core.DataType.BOOL
-        else:
-            raise ValueError("Not supported numpy dtype " + str(dtype))
-

 def get_all_op_protos():
    """

--- a/python/paddle/v2/fluid/initializer.py
+++ b/python/paddle/v2/fluid/initializer.py
@@ -285,3 +285,86 @@ class XavierInitializer(Initializer):
                })
        var.op = op
        return op
+
+
+class MSRAInitializer(Initializer):
+    """Implements the MSRA initializer a.k.a. Kaiming Initializer
+
+    This class implements the weight initialization from the paper
+    Delving Deep into Rectifiers: Surpassing Human-Level Performance on
+    ImageNet Classification[1] by Kaiming He, Xiangyu Zhang, Shaoqing Ren
+    and Jian Sun. This is a robust initialization method that particularly
+    considers the rectifier nonlinearities. In case of Uniform distribution,
+    the range is [-x, x], where x = sqrt(6 / fan_in). In case of Normal
+    distribution, the mean is 0 and the standard deviation
+    is sqrt(2 / fan_in).
+
+    References:
+        [1] Delving Deep into Rectifiers: Surpassing Human-Level Performance
+            on ImageNet Classification
+            (https://arxiv.org/abs/1502.01852)
+    """
+
+    def __init__(self, uniform=True, fan_in=None, seed=0):
+        """Constructor for MSRAInitializer
+
+        Args:
+            uniform: whether to use uniform or normal distribution
+            fan_in: fan_in for MSRAInitializer. If None, it is
+                    inferred from the variable.
+            seed: random seed
+
+        Note: It is recommended to set fan_in to None for most cases.
+        """
+        assert uniform is not None
+        assert seed is not None
+        super(MSRAInitializer, self).__init__()
+        self._uniform = uniform
+        self._fan_in = fan_in
+        self._seed = seed
+
+    def __call__(self, var, block):
+        """Add MSRA initialization ops for a variable
+
+        Args:
+            var: Variable that needs to be initialized
+            block: The block in which initialization ops
+                   should be added
+
+        Returns:
+            the initialization op
+        """
+        assert isinstance(var, framework.Variable)
+        assert isinstance(block, framework.Block)
+        f_in, f_out = self._compute_fans(var)
+
+        # If fan_in is passed, use it
+        fan_in = f_in if self._fan_in is None else self._fan_in
+
+        if self._uniform:
+            limit = np.sqrt(6.0 / float(fan_in))
+            op = block.prepend_op(
+                type="uniform_random",
+                outputs={"Out": var},
+                attrs={
+                    "shape": var.shape,
+                    "data_type": int(var.data_type),
+                    "min": -limit,
+                    "max": limit,
+                    "seed": self._seed
+                })
+
+        else:
+            std = np.sqrt(2.0 / float(fan_in))
+            op = block.prepend_op(
+                type="gaussian_random",
+                outputs={"Out": var},
+                attrs={
+                    "shape": var.shape,
+                    "data_type": int(var.data_type),
+                    "mean": 0.0,
+                    "std": std,
+                    "seed": self._seed
+                })
+        var.op = op
+        return op
--- a/python/paddle/v2/fluid/layer_helper.py
+++ b/python/paddle/v2/fluid/layer_helper.py
@@ -2,7 +2,7 @@ import copy
 import itertools

 from paddle.v2.fluid.framework import Variable, g_main_program, \
-    g_startup_program, unique_name, Program
+    g_startup_program, unique_name, Program, dtype_is_floating
 from paddle.v2.fluid.initializer import ConstantInitializer, \
    UniformInitializer, XavierInitializer

@@ -61,7 +61,7 @@ class LayerHelper(object):

    @property
    def param_attr(self):
-        default = {'name': None, 'initializer': XavierInitializer()}
+        default = {'name': None}
        actual = self.kwargs.get('param_attr', None)
        if actual is None:
            actual = default
@@ -72,7 +72,7 @@ class LayerHelper(object):

    @property
    def bias_attr(self):
-        default = {'name': None, 'initializer': ConstantInitializer()}
+        default = {'name': None}
        bias_attr = self.kwargs.get('bias_attr', None)
        if bias_attr is None:
            bias_attr = default
@@ -119,12 +119,17 @@ class LayerHelper(object):
        attr_copy = copy.deepcopy(attr)
        if initializer is not None:
            attr_copy['initializer'] = initializer
+        else:
+            attr_copy['initializer'] = self._get_default_initializer(dtype)
        if attr_copy['name'] is None:
            attr_copy['name'] = unique_name(".".join([self.name, suffix]))
        self.startup_program.global_block().create_parameter(
            dtype=dtype, shape=shape, **attr_copy)
        return self.main_program.global_block().create_parameter(
-            name=attr_copy['name'], dtype=dtype, shape=shape)
+            name=attr_copy['name'],
+            dtype=dtype,
+            shape=shape,
+            trainable=attr_copy.get('trainable', True))

    def create_tmp_variable(self, dtype):
        return self.main_program.current_block().create_var(
@@ -149,13 +154,19 @@ class LayerHelper(object):
            persistable=True,
            initializer=initializer)

-    def append_bias_op(self, input_var, dim_start=1, dim_end=None):
+    def append_bias_op(self,
+                       input_var,
+                       bias_initializer,
+                       dim_start=1,
+                       dim_end=None):
        """
        Append bias operator and return its output. If the user does not set
        bias_attr, append_bias_op will return input_var

-        :param input_var: the input variable. The len(input_var.shape) is larger
-        or equal than 2.
+        :param input_var: the input variable. The len(input_var.shape) is
+        greater than or equal to 2.
+        :param bias_initializer: an instance of a subclass of Initializer used to
+        initialize the bias.
        :param dim_start:
        :param dim_end: the shape of the bias will be
        input_var.shape[dim_start:dim_end]. The bias is broadcasted to other
@@ -167,7 +178,11 @@ class LayerHelper(object):
            return input_var

        b = self.create_parameter(
-            attr=bias_attr, shape=size, dtype=input_var.data_type, suffix='b')
+            attr=bias_attr,
+            shape=size,
+            dtype=input_var.data_type,
+            suffix='b',
+            initializer=bias_initializer)
        tmp = self.create_tmp_variable(dtype=input_var.data_type)
        self.append_op(
            type='elementwise_add',
@@ -191,3 +206,10 @@ class LayerHelper(object):
            outputs={"Y": [tmp]},
            attrs=act)
        return tmp
+
+    def _get_default_initializer(self, dtype):
+        if dtype is None or dtype_is_floating(dtype) is True:
+            return XavierInitializer()
+        else:
+            # For integer and boolean types, initialize with all zeros
+            return ConstantInitializer()
--- a/python/paddle/v2/fluid/layers.py
+++ b/python/paddle/v2/fluid/layers.py
--- a/python/paddle/v2/fluid/optimizer.py
+++ b/python/paddle/v2/fluid/optimizer.py
@@ -170,7 +170,8 @@ class Optimizer(object):

        optimize_ops = []
        for param_and_grad in parameters_and_grads:
-            if param_and_grad[1] is not None:
+            if param_and_grad[0].trainable is True and param_and_grad[
+                    1] is not None:
                optimize_op = self._append_optimize_op(loss.block,
                                                       param_and_grad)
                optimize_ops.append(optimize_op)

--- a/python/paddle/v2/fluid/tests/book/test_image_classification_train.py
+++ b/python/paddle/v2/fluid/tests/book/test_image_classification_train.py
@@ -4,6 +4,7 @@ import paddle.v2.fluid.core as core
 import paddle.v2.fluid.framework as framework
 import paddle.v2.fluid.layers as layers
 import paddle.v2.fluid.nets as nets
+import paddle.v2.fluid.evaluator as evaluator
 from paddle.v2.fluid.executor import Executor
 from paddle.v2.fluid.initializer import XavierInitializer
 from paddle.v2.fluid.optimizer import AdamOptimizer
@@ -103,12 +104,13 @@ net = vgg16_bn_drop(images)
 predict = layers.fc(input=net, size=classdim, act='softmax')
 cost = layers.cross_entropy(input=predict, label=label)
 avg_cost = layers.mean(x=cost)
-accuracy = layers.accuracy(input=predict, label=label)

 # optimizer = SGDOptimizer(learning_rate=0.001)
 optimizer = AdamOptimizer(learning_rate=0.001)
 opts = optimizer.minimize(avg_cost)

+accuracy, acc_out = evaluator.accuracy(input=predict, label=label)
+
 BATCH_SIZE = 128
 PASS_NUM = 1

@@ -124,6 +126,7 @@ exe.run(framework.default_startup_program())

 for pass_id in range(PASS_NUM):
    batch_id = 0
+    accuracy.reset(exe)
    for data in train_reader():
        img_data = np.array(map(lambda x: x[0].reshape(data_shape),
                                data)).astype("float32")
@@ -141,12 +144,14 @@ for pass_id in range(PASS_NUM):
        outs = exe.run(framework.default_main_program(),
                       feed={"pixel": tensor_img,
                             "label": tensor_y},
-                       fetch_list=[avg_cost, accuracy])
+                       fetch_list=[avg_cost, acc_out])

        loss = np.array(outs[0])
        acc = np.array(outs[1])
+        pass_acc = accuracy.eval(exe)
        print("pass_id:" + str(pass_id) + " batch_id:" + str(batch_id) +
-              " loss:" + str(loss) + " acc:" + str(acc))
+              " loss:" + str(loss) + " acc:" + str(acc) + " pass_acc:" + str(
+                  pass_acc))
        batch_id = batch_id + 1

        if batch_id > 1:

--- a/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py
+++ b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py
--- a/python/paddle/v2/fluid/tests/book/test_recognize_digits_conv.py
+++ b/python/paddle/v2/fluid/tests/book/test_recognize_digits_conv.py
--- a/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py
+++ b/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py
--- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py
+++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py
--- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py
+++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py
--- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py
+++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py
--- a/python/paddle/v2/fluid/tests/test_gru_unit_op.py
+++ b/python/paddle/v2/fluid/tests/test_gru_unit_op.py
--- a/python/paddle/v2/fluid/tests/test_initializer.py
+++ b/python/paddle/v2/fluid/tests/test_initializer.py
--- a/python/paddle/v2/fluid/tests/test_layers.py
+++ b/python/paddle/v2/fluid/tests/test_layers.py
--- a/python/paddle/v2/fluid/tests/test_linear_chain_crf_op.py
+++ b/python/paddle/v2/fluid/tests/test_linear_chain_crf_op.py
@@ -104,7 +104,7 @@ class TestLinearChainCrfOp(OpTest):
        transition_exps = np.exp(transition)

        labels = np.random.randint(
-            low=0, high=TAG_NUM, size=(lod[-1][-1], 1), dtype="int32")
+            low=0, high=TAG_NUM, size=(lod[-1][-1], 1), dtype="int64")

        self.inputs = {
            "Emission": (emission, lod),

--- a/python/paddle/v2/fluid/tests/test_maxout_op.py
+++ b/python/paddle/v2/fluid/tests/test_maxout_op.py
--- a/python/paddle/v2/fluid/tests/test_mnist_if_else_op.py
+++ b/python/paddle/v2/fluid/tests/test_mnist_if_else_op.py
--- a/python/paddle/v2/fluid/tests/test_roi_pool_op.py
+++ b/python/paddle/v2/fluid/tests/test_roi_pool_op.py
--- a/python/paddle/v2/fluid/tests/test_sequence_slice_op.py
+++ b/python/paddle/v2/fluid/tests/test_sequence_slice_op.py
--- a/python/paddle/v2/fluid/tests/test_variable.py
+++ b/python/paddle/v2/fluid/tests/test_variable.py