Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into lod_tensor_py

28dc4340 · dangqingqing · 68943f59 · 53f00025 · 28dc4340 · 28dc4340
23 changed file
--- a/paddle/gserver/layers/DetectionOutputLayer.cpp
+++ b/paddle/gserver/layers/DetectionOutputLayer.cpp
@@ -139,7 +139,13 @@ void DetectionOutputLayer::forward(PassType passType) {
                                       allDecodedBBoxes,
                                       &allIndices);

+  if (numKept > 0) {
    resetOutput(numKept, 7);
+  } else {
+    MatrixPtr outV = getOutputValue();
+    outV = NULL;
+    return;
+  }
  MatrixPtr outV = getOutputValue();
  getDetectionOutput(confBuffer_->getData(),
                     numKept,

--- a/paddle/gserver/layers/DetectionUtil.cpp
+++ b/paddle/gserver/layers/DetectionUtil.cpp
@@ -469,7 +469,7 @@ size_t getDetectionIndices(
    const size_t numClasses,
    const size_t backgroundId,
    const size_t batchSize,
-    const size_t confThreshold,
+    const real confThreshold,
    const size_t nmsTopK,
    const real nmsThreshold,
    const size_t keepTopK,

--- a/paddle/gserver/layers/DetectionUtil.h
+++ b/paddle/gserver/layers/DetectionUtil.h
@@ -275,7 +275,7 @@ size_t getDetectionIndices(
    const size_t numClasses,
    const size_t backgroundId,
    const size_t batchSize,
-    const size_t confThreshold,
+    const real confThreshold,
    const size_t nmsTopK,
    const real nmsThreshold,
    const size_t keepTopK,

--- a/paddle/gserver/layers/MKLDNNFcLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNFcLayer.cpp
@@ -77,24 +77,6 @@ void MKLDNNFcLayer::convertWeightsToPaddle() {
  wgtVal_->reorderDataTo(wgtVal_, dstFmt, targetDim);
 }

-void MKLDNNFcLayer::convertOutputToOtherDevice() {
-  copyOutputInfoToOtherDevice();
-  // find other cpu device and reorder output to cpu device
-  int cnt = 0;
-  for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
-    if (outputOtherDevice_[i].deviceId == CPU_DEVICE) {
-      // fc cpu output value do not need convert
-      // just share point
-      outputOtherDevice_[i].value = output_.value;
-      ++cnt;
-    }
-  }
-
-  if (cnt > 1) {
-    LOG(WARNING) << "should not have more than one CPU devie";
-  }
-}
-
 void MKLDNNFcLayer::reshape() {
  const Argument& input = getInput(0, getPrev(0)->getDeviceId());
  int batchSize = input.getBatchSize();
@@ -155,7 +137,10 @@ void MKLDNNFcLayer::resetFwd() {
  // change original output value to mkldnn output value
  output_.value = std::dynamic_pointer_cast<Matrix>(outVal_);
  if (!outputIsOnlyMKLDNN()) {
-    convertOutputToOtherDevice();
+    copyOutputInfoToOtherDevice();
+    // fc cpu output value do not need create convert
+    // just share point
+    getOutput(CPU_DEVICE).value->setData(output_.value->getData());
  }

  // create forward handle
@@ -235,13 +220,12 @@ void MKLDNNFcLayer::resetBwd() {
  pipelineBwd_.push_back(*bwdWgt_);

  /// backward data
-  device = inputIsOnlyMKLDNN() ? MKLDNN_DEVICE : CPU_DEVICE;
-  const MatrixPtr& in = getInputGrad(0, device);
+  const MatrixPtr& in = inputLayers_[0]->getOutput().grad;
  if (in == nullptr) {
    return;
  }
-  if (getInput(0, device).getAllCount() > 1) {
-    // TODO(TJ): use outputMaps_ ways when merge outgrad done
+  if (getInput(0, MKLDNN_DEVICE).getAllCount() > 1) {
+    // TODO(TJ): use outputMaps_ ways to get the inGrad_ when merge outgrad done
  } else {
    inGrad_ = MKLDNNMatrix::create(in, inVal_->getPrimitiveDesc());
  }
@@ -258,13 +242,21 @@ void MKLDNNFcLayer::resetBwd() {
  pipelineBwd_.push_back(*bwdData_);
 }

+void MKLDNNFcLayer::updateInputData() {
+  if (inputLayers_[0]->getType() != "data") {
+    return;
+  }
+  real* iData = getInputValue(0, CPU_DEVICE)->getData();
+  inVal_->setData(iData);
+}
+
 void MKLDNNFcLayer::forward(PassType passType) {
  Layer::forward(passType);
  reshape();

  {
    REGISTER_TIMER_INFO("mkldnn_FwdTimer", getName().c_str());
-    syncInputValue();
+    updateInputData();

    // just submit forward pipeline
    stream_->submit(pipelineFwd_);
@@ -286,7 +278,6 @@ void MKLDNNFcLayer::backward(const UpdateCallback& callback) {
    REGISTER_TIMER_INFO("mkldnn_bwdTimer", getName().c_str());
    resetBwd();

-    syncOutputGrad();
    // just sumbmit backward pipeline
    stream_->submit(pipelineBwd_);
  }

--- a/paddle/gserver/layers/MKLDNNFcLayer.h
+++ b/paddle/gserver/layers/MKLDNNFcLayer.h
@@ -53,6 +53,8 @@ public:

  void backward(const UpdateCallback& callback) override;

+  void updateInputData() override;
+
 protected:
  /**
   * reshape the input image sizes
@@ -72,8 +74,6 @@ protected:
   * only would be called when needed
   */
  void resetBwd();
-
-  void convertOutputToOtherDevice() override;
 };

 }  // namespace paddle
--- a/paddle/gserver/layers/MKLDNNLayer.h
+++ b/paddle/gserver/layers/MKLDNNLayer.h
@@ -114,10 +114,10 @@ public:
  virtual void convertWeightsToPaddle() {}

  /**
-   * convert MKLDNN output to other device.
-   * only support CPU device yet
+   * Update input value data when input layer is "data" type.
+   * Since the input value data address might be changed.
   */
-  virtual void convertOutputToOtherDevice() {}
+  virtual void updateInputData() {}

  /**
   * print info about sizes
@@ -155,6 +155,7 @@ protected:
   *        copy base info and do not copy data value
   */
  void copyOutputInfoToOtherDevice() {
+    int cnt = 0;
    for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
      outputOtherDevice_[i].setFrameHeight(output_.getFrameHeight());
      outputOtherDevice_[i].setFrameWidth(output_.getFrameWidth());
@@ -163,6 +164,12 @@ protected:
      outputOtherDevice_[i].subSequenceStartPositions =
          output_.subSequenceStartPositions;
      outputOtherDevice_[i].cpuSequenceDims = output_.cpuSequenceDims;
+      if (outputOtherDevice_[i].deviceId == CPU_DEVICE) {
+        ++cnt;
+      }
+    }
+    if (cnt > 1) {
+      LOG(WARNING) << "should not have more than one CPU devie";
    }
  }

@@ -193,32 +200,6 @@ protected:
    return outputOtherDevice_.size() == 0;
  }

-  /**
-   * Sync input value data
-   */
-  void syncInputValue() {
-    if (inputIsOnlyMKLDNN()) {
-      return;
-    }
-    real* iData = getInputValue(0, CPU_DEVICE)->getData();
-    // update input data
-    // since it might be changed if this is after data layer
-    inVal_->updateData(iData);
-  }
-
-  /**
-   * Sync output grad data
-   */
-  void syncOutputGrad() {
-    if (outputIsOnlyMKLDNN()) {
-      return;
-    }
-
-    // update diff
-    real* oDiff = getOutput(CPU_DEVICE).grad->getData();
-    outGrad_->updateData(oDiff);
-  }
-
  /**
   * Set deviceId of this layer.
   */

--- a/paddle/math/MKLDNNMatrix.cpp
+++ b/paddle/math/MKLDNNMatrix.cpp
@@ -33,14 +33,12 @@ MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m, memory::primitive_desc pd) {
    size_t width = cnts / dims[0];
    m = Matrix::create(height, width, false, false);
  }
-
  CHECK(m) << " Matrix should not be empty";
+
  CpuMatrixPtr cpuMatrix = std::dynamic_pointer_cast<CpuMatrix>(m);
  CHECK(cpuMatrix) << "Only support create from CPU matrix yet";
-
-  CHECK_EQ(cnts, m->getElementCnt()) << "Count size does not match";
-  return std::make_shared<MKLDNNMatrix>(
-      m->getData(), m->getHeight(), m->getWidth(), pd);
+  CHECK_EQ(cpuMatrix->getElementCnt(), cnts) << "Count size does not match";
+  return std::make_shared<MKLDNNMatrix>(cpuMatrix, pd);
 }

 MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m,
@@ -138,7 +136,7 @@ void MKLDNNMatrix::downSpatial() {
      mkldnn_primitive_create(&result, pd.get(), nullptr, nullptr),
      "could not create a memory primitive");
  reset(result);
-  set_data_handle(getData());
+  set_data_handle(data_);
 }

 }  // namespace paddle
--- a/paddle/math/MKLDNNMatrix.h
+++ b/paddle/math/MKLDNNMatrix.h
@@ -30,11 +30,10 @@ typedef std::shared_ptr<MKLDNNMatrix> MKLDNNMatrixPtr;
 */
 class MKLDNNMatrix : public CpuMatrix, public mkldnn::memory {
 public:
-  MKLDNNMatrix(real* data,
-               size_t height,
-               size_t width,
-               mkldnn::memory::primitive_desc pd)
-      : CpuMatrix(data, height, width, false), mkldnn::memory(pd, data) {}
+  MKLDNNMatrix(CpuMatrixPtr m, mkldnn::memory::primitive_desc pd)
+      : CpuMatrix(m->getData(), m->getHeight(), m->getWidth(), false),
+        mkldnn::memory(pd, m->getData()),
+        m_(m) {}

  ~MKLDNNMatrix() {}

@@ -81,11 +80,29 @@ public:
  void downSpatial();

  /**
-   * Update the memory data handle.
+   * set the memory data handle.
   * Caution: This will not check the buffer size of the data,
   *          it should be coverd by user.
   */
-  void updateData(void* data) { set_data_handle(data); }
+  void setData(real* data) {
+    set_data_handle(data);
+    CpuMatrix::setData(data);
+    m_.reset();
+  }
+
+  /**
+   * override Matrix::getData
+   * check data before return
+   */
+  real* getData() override {
+    CHECK_EQ((void*)data_, get_data_handle());
+    return data_;
+  }
+
+  const real* getData() const override {
+    CHECK_EQ((void*)data_, get_data_handle());
+    return data_;
+  }

  /**
   * Get primitive descriptor.
@@ -143,6 +160,10 @@ protected:
                   memory::format srcFmt,
                   memory::format dstFmt,
                   memory::dims dm);
+
+private:
+  // save the CpuMatrixPtr in case the buffer released outside
+  CpuMatrixPtr m_;
 };

 }  // namespace paddle
--- a/paddle/operators/concat_op.cc
+++ b/paddle/operators/concat_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/concat_op.h"
+#include <vector>
+
+namespace paddle {
+namespace operators {
+using framework::Tensor;
+
+class ConcatOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    auto ins = ctx.MultiInput<framework::Tensor>("X");
+    auto *out = ctx.Output<framework::Tensor>("Out");
+    size_t axis = static_cast<size_t>(ctx.Attr<int>("axis"));
+    size_t n = ins.size();
+
+    PADDLE_ENFORCE_GT(n, 1, "Input tensors count should > 1.");
+
+    auto out_dims = ins[0]->dims();
+    size_t in_zero_dims_size = out_dims.size();
+    for (size_t i = 1; i < n; i++) {
+      for (size_t j = 0; j < in_zero_dims_size; j++) {
+        if (j == axis) {
+          out_dims[axis] += ins[i]->dims()[j];
+          continue;
+        }
+        PADDLE_ENFORCE_EQ(out_dims[j], ins[i]->dims()[j],
+                          "Input tensors should have the same "
+                          "elements except the specify axis.")
+      }
+    }
+    out->Resize(out_dims);
+  }
+};
+
+class ConcatOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ConcatOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "the input tensors of concat operator.").AsDuplicable();
+    AddOutput("Out", "the output tensor of concat operator.");
+    AddComment(R"DOC(
+            Join the input tensors along with the axis.
+            Examples:
+              Input[0] = [[1,2],[3,4]]
+              Input[1] = [[5,6]]
+              axis = 0
+              Output = [[1,2],
+                        [3,4],
+                        [5,6]]
+        )DOC");
+    AddAttr<int>("axis", "The axis which the inputs will be joined with.")
+        .SetDefault(0);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(concat, ops::ConcatOp, ops::ConcatOpMaker)
+REGISTER_OP_CPU_KERNEL(concat,
+                       ops::ConcatKernel<paddle::platform::CPUPlace, float>)
--- a/paddle/operators/concat_op.cu
+++ b/paddle/operators/concat_op.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/concat_op.h"
+
+namespace ops = paddle::operators;
+// TODO(Yancey1989) Add GPU kernel
--- a/paddle/operators/concat_op.h
+++ b/paddle/operators/concat_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename Place, typename T>
+class ConcatKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto ins = ctx.MultiInput<framework::Tensor>("X");
+    auto* out = ctx.Output<framework::Tensor>("Out");
+    int64_t axis = static_cast<int64_t>(ctx.Attr<int>("axis"));
+    size_t n = ins.size();
+    size_t output_axis_dim = 0;
+    size_t before = 1, after = 1;
+    for (size_t i = 0; i < n; i++) {
+      output_axis_dim += ins[i]->dims()[axis];
+    }
+    auto& input_zero = ins[0];
+    for (int64_t i = 0; i < input_zero->dims().size(); i++) {
+      if (i == axis) {
+        continue;
+      }
+      if (i < axis) {
+        before *= input_zero->dims()[i];
+      } else {
+        after *= input_zero->dims()[i];
+      }
+    }
+    size_t output_offset = 0;
+    for (size_t i = 0; i < n; i++) {
+      auto& in = ins[i];
+      auto axis_dim = in->dims()[axis];
+      for (size_t j = 0; j < before; j++) {
+        size_t len = axis_dim * after * sizeof(T);
+        const T* src = in->data<T>() + axis_dim * after * j;
+        T* out_data = out->mutable_data<T>(platform::CPUPlace());
+        T* dest = out_data + output_offset + output_axis_dim * after * j;
+        memcpy(dest, src, len);
+      }
+      output_offset += axis_dim * after;
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/platform/enforce.h
+++ b/paddle/platform/enforce.h
@@ -25,10 +25,6 @@ limitations under the License. */
 #include "paddle/string/printf.h"
 #include "paddle/string/to_string.h"

-#ifdef __GNUC__
-#include <cxxabi.h>  // for __cxa_demangle
-#endif
-
 #ifndef PADDLE_ONLY_CPU

 #include "paddle/platform/dynload/cublas.h"
@@ -46,19 +42,6 @@ limitations under the License. */
 namespace paddle {
 namespace platform {

-namespace {
-#ifdef __GNUC__
-inline std::string demangle(std::string name) {
-  int status = -4;  // some arbitrary value to eliminate the compiler warning
-  std::unique_ptr<char, void (*)(void*)> res{
-      abi::__cxa_demangle(name.c_str(), NULL, NULL, &status), std::free};
-  return (status == 0) ? res.get() : name;
-}
-#else
-inline std::string demangle(std::string name) { return name; }
-#endif
-}
-
 struct EnforceNotMet : public std::exception {
  std::exception_ptr exp_;
  std::string err_str_;
@@ -79,7 +62,7 @@ struct EnforceNotMet : public std::exception {
      Dl_info info;
      for (int i = 0; i < size; ++i) {
        if (dladdr(call_stack[i], &info)) {
-          auto demangled = demangle(info.dli_sname);
+          auto demangled = info.dli_sname;
          auto addr_offset = static_cast<char*>(call_stack[i]) -
                             static_cast<char*>(info.dli_saddr);
          sout << string::Sprintf("%-3d %*0p %s + %zd\n", i,

--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -50,6 +50,7 @@ USE_OP(minus);
 USE_OP(cos_sim);
 USE_CPU_ONLY_OP(gather);
 USE_CPU_ONLY_OP(scatter);
+USE_CPU_ONLY_OP(concat);
 USE_OP(top_k);
 USE_OP(squared_l2_distance);
 USE_OP(sum);

--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -30,6 +30,8 @@ Configuring cmake in /paddle/build ...
      -DCMAKE_BUILD_TYPE=Release
      -DWITH_DOC=OFF
      -DWITH_GPU=${WITH_GPU:-OFF}
+      -DWITH_MKLDNN=${WITH_MKLDNN:-ON}
+      -DWITH_MKLML=${WITH_MKLML:-ON}
      -DWITH_AVX=${WITH_AVX:-OFF}
      -DWITH_GOLANG=${WITH_GOLANG:-ON}
      -DWITH_SWIG_PY=ON
@@ -50,6 +52,8 @@ cmake .. \
      -DCMAKE_BUILD_TYPE=Release \
      -DWITH_DOC=OFF \
      -DWITH_GPU=${WITH_GPU:-OFF} \
+      -DWITH_MKLDNN=${WITH_MKLDNN:-ON} \
+      -DWITH_MKLML=${WITH_MKLML:-ON} \
      -DWITH_AVX=${WITH_AVX:-OFF} \
      -DWITH_GOLANG=${WITH_GOLANG:-ON} \
      -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON} \

--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -3748,8 +3748,8 @@ class SwitchOrderLayer(LayerBase):
    def __init__(self, name, inputs, reshape, **xargs):
        super(SwitchOrderLayer, self).__init__(
            name, 'switch_order', 0, inputs=inputs, **xargs)
-        self.config.reshape_conf.heightAxis.extend(reshape['height'])
-        self.config.reshape_conf.widthAxis.extend(reshape['width'])
+        self.config.reshape_conf.height_axis.extend(reshape['height'])
+        self.config.reshape_conf.width_axis.extend(reshape['width'])


 # Deprecated, use a new layer specific class instead

--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -1223,7 +1223,8 @@ def detection_output_layer(input_loc,
                           name=None):
    """
    Apply the NMS to the output of network and compute the predict bounding
-    box location.
+    box location. The output of this layer could be None if there is no valid
+    bounding box.

    :param name: The Layer Name.
    :type name: basestring
@@ -6460,6 +6461,7 @@ def switch_order_layer(input,
    return LayerOutput(
        name=name,
        layer_type=LayerType.SWITCH_ORDER_LAYER,
+        activation=act,
        parents=input,
        size=l.config.size)


--- a/python/paddle/v2/event.py
+++ b/python/paddle/v2/event.py
@@ -53,10 +53,13 @@ class BeginPass(object):
 class EndPass(WithMetric):
    """
    Event On One Pass Training Complete.
+    To get the output of a specific layer, add "event.gm.getLayerOutputs('predict_layer')"
+    in your event_handler call back
    """

-    def __init__(self, pass_id, evaluator):
+    def __init__(self, pass_id, evaluator, gm):
        self.pass_id = pass_id
+        self.gm = gm
        WithMetric.__init__(self, evaluator)


@@ -73,10 +76,13 @@ class BeginIteration(object):
 class EndIteration(WithMetric):
    """
    Event On One Batch Training Complete.
+    To get the output of a specific layer, add "event.gm.getLayerOutputs('predict_layer')"
+    in your event_handler call back
    """

-    def __init__(self, pass_id, batch_id, cost, evaluator):
+    def __init__(self, pass_id, batch_id, cost, evaluator, gm):
        self.pass_id = pass_id
        self.batch_id = batch_id
        self.cost = cost
+        self.gm = gm
        WithMetric.__init__(self, evaluator)
--- a/python/paddle/v2/framework/op.py
+++ b/python/paddle/v2/framework/op.py
@@ -43,7 +43,6 @@ class OpDescCreationMethod(object):
        if len(args) != 0:
            raise ValueError("Only keyword arguments are supported.")
        op_desc = framework_pb2.OpDesc()
-
        for input_parameter in self.__op_proto__.inputs:
            input_arguments = kwargs.get(input_parameter.name, [])
            if is_str(input_arguments):

--- a/python/paddle/v2/framework/tests/CMakeLists.txt
+++ b/python/paddle/v2/framework/tests/CMakeLists.txt
@@ -35,4 +35,5 @@ py_test(test_lookup_table SRCS test_lookup_table.py)
 py_test(test_scale_and_identity_op SRCS test_scale_and_identity_op.py)
 py_test(test_sum_op SRCS test_sum_op.py)
 py_test(mnist SRCS mnist.py)
+py_test(test_concat_op SRCS test_concat_op.py)
 py_test(test_squared_l2_distance_op SRCS test_squared_l2_distance_op.py)
--- a/python/paddle/v2/framework/tests/gradient_checker.py
+++ b/python/paddle/v2/framework/tests/gradient_checker.py
@@ -15,7 +15,6 @@ def create_op(op_type):
        kwargs[in_name] = in_name
    for out_name in Operator.get_op_output_names(op_type):
        kwargs[out_name] = out_name
-
    return Operator(op_type, **kwargs)



--- a/python/paddle/v2/framework/tests/op_test_util.py
+++ b/python/paddle/v2/framework/tests/op_test_util.py
@@ -27,8 +27,18 @@ class OpTestMeta(type):
                places.append(core.GPUPlace(0))

            for place in places:
-                for in_name in Operator.get_op_input_names(self.type):
-                    if hasattr(self, "inputs") and in_name in self.inputs:
+                for in_name, in_dup in Operator.get_op_inputs(self.type):
+                    if hasattr(self, 'inputs') and in_name in self.inputs:
+                        kwargs[in_name] = []
+                        if in_dup:
+                            arrays = self.inputs[in_name]
+                            for index, arr in enumerate(arrays):
+                                var = scope.new_var(in_name + str(index))
+                                tensor = var.get_tensor()
+                                tensor.set_dims(arr.shape)
+                                tensor.set(arr, place)
+                                kwargs[in_name].append(in_name + str(index))
+                        else:
                            kwargs[in_name] = in_name
                            var = scope.new_var(in_name).get_tensor()
                            arr = self.inputs[in_name]
@@ -37,7 +47,7 @@ class OpTestMeta(type):
                    else:
                        kwargs[in_name] = "@EMPTY@"

-                for out_name in Operator.get_op_output_names(self.type):
+                for out_name, out_dup in Operator.get_op_outputs(self.type):
                    if not hasattr(self, "outputs"):
                        raise ValueError(
                            "The test op must set self.outputs dict.")
@@ -60,7 +70,7 @@ class OpTestMeta(type):
                ctx = core.DeviceContext.create(place)
                op.run(scope, ctx)

-                for out_name in Operator.get_op_output_names(self.type):
+                for out_name, out_dup in Operator.get_op_outputs(self.type):
                    actual = numpy.array(scope.find_var(out_name).get_tensor())
                    expect = self.outputs[out_name]
                    self.assertTrue(

--- a/python/paddle/v2/framework/tests/test_concat_op.py
+++ b/python/paddle/v2/framework/tests/test_concat_op.py
+import unittest
+import numpy as np
+from gradient_checker import GradientChecker, create_op
+from op_test_util import OpTestMeta
+
+
+class TestConcatOp(unittest.TestCase):
+    __metaclass__ = OpTestMeta
+
+    def setUp(self):
+        self.type = "concat"
+        x0 = np.random.random((2, 3, 2, 5)).astype('float32')
+        x1 = np.random.random((2, 3, 3, 5)).astype('float32')
+        x2 = np.random.random((2, 3, 4, 5)).astype('float32')
+        axis = 2
+        self.inputs = {'X': [x0, x1, x2]}
+        self.attrs = {'axis': axis}
+        self.outputs = {'Out': np.concatenate((x0, x1, x2), axis=axis)}
+
+
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/v2/trainer.py
+++ b/python/paddle/v2/trainer.py
@@ -174,13 +174,18 @@ class SGD(object):
                        pass_id=pass_id,
                        batch_id=batch_id,
                        cost=cost,
-                        evaluator=batch_evaluator))
+                        evaluator=batch_evaluator,
+                        gm=self.__gradient_machine__))
                self.__parameter_updater__.finishBatch(cost)
                batch_evaluator.finish()

            self.__parameter_updater__.finishPass()
            pass_evaluator.finish()
-            event_handler(v2_event.EndPass(pass_id, evaluator=pass_evaluator))
+            event_handler(
+                v2_event.EndPass(
+                    pass_id,
+                    evaluator=pass_evaluator,
+                    gm=self.__gradient_machine__))
        self.__gradient_machine__.finish()

    def test(self, reader, feeding=None):