resolve conflicts

d8ea560e · dangqingqing · f188e22b · 7c8acd4f · d8ea560e · d8ea560e
15 changed file
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -45,6 +45,7 @@ cc_library(paddle_pybind SHARED
    SRCS pybind.cc
    DEPS pybind python backward
    sgd_op
+    gather_op
    add_op
    mul_op
    rowwise_add_op

--- a/paddle/framework/pybind.cc
+++ b/paddle/framework/pybind.cc
@@ -43,6 +43,7 @@ USE_OP_ITSELF(recurrent_op);
 USE_OP(gaussian_random);
 USE_OP(uniform_random);
 USE_OP(lookup_table);
+USE_CPU_ONLY_OP(gather);

 namespace paddle {
 namespace framework {

--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
@@ -1012,11 +1012,6 @@ void RecurrentGradientMachine::generateSequence() {
                           /* width */ resultNum,
                           false,
                           /* useGpu */ false);
-    Matrix::resizeOrCreate(generator_.outArg.value,
-                           /* height */ maxGenWordCount,
-                           /* width */ 1,
-                           false,
-                           /* useGpu */ false);
  }
  ICpuGpuVector::resizeOrCreate(generator_.outArg.sequenceStartPositions,
                                numSequences + 1,
@@ -1026,7 +1021,7 @@ void RecurrentGradientMachine::generateSequence() {
  } else {
    oneWaySearch(numSequences);
  }
-  if (dataArgsSize_) createDataOutlink(batchMachineIdVec_);
+  if (dataArgsSize_) createDataOutlink();

  size_t size = generator_.ids.size();
  generator_.outArg.ids->resize(size);
@@ -1106,6 +1101,7 @@ void RecurrentGradientMachine::oneWaySearch(size_t batchSize) {
  }

  batchMachineIdVec_.clear();
+  batchMachineStartPos_.clear();
  int* starts = generator_.outArg.sequenceStartPositions->getMutableData(false);
  starts[0] = 0;
  generator_.ids.clear();
@@ -1312,13 +1308,20 @@ void RecurrentGradientMachine::fillGenOutputs() {
    finalPaths_[i].resize(minFinalPathsSize);
  }

-  batchMachineIdVec_.clear();
  generator_.ids.clear();
  int* starts = generator_.outArg.sequenceStartPositions->getMutableData(false);
  starts[0] = 0;
  if (numResults > 1) {
-    real* probs = generator_.outArg.in->getData();
+    int idsProbSaveSize = 0;
+    for (auto inSeq : finalPaths_) {
+      for (auto path : inSeq) idsProbSaveSize += path.ids.size();
+      idsProbSaveSize += inSeq.size();
+    }
+    Matrix::resizeOrCreate(
+        generator_.outArg.value, idsProbSaveSize, 1, false, false);
    real* idsProb = generator_.outArg.value->getData();
+
+    real* probs = generator_.outArg.in->getData();
    size_t curPos = 0;
    for (size_t i = 0; i < finalPaths_.size(); ++i) {
      for (size_t j = 0; j < finalPaths_[i].size(); ++j) {
@@ -1333,24 +1336,16 @@ void RecurrentGradientMachine::fillGenOutputs() {
        curPos += genLen;
        idsProb[curPos++] = -1.0;
        probs[i * numResults + j] = path.logProb;
-
-        if (!j && dataArgsSize_) {
-          // in beam search, here only reserved the top 1 generated result
-          // for out_links that are not the generated word indices.
-          batchMachineIdVec_.insert(batchMachineIdVec_.end(),
-                                    path.machineIdVec.begin(),
-                                    path.machineIdVec.end());
-        }
      }
      starts[i + 1] = generator_.ids.size();
    }
  } else {
    for (size_t i = 0; i < finalPaths_.size(); ++i) {
      CHECK(!finalPaths_[i].empty());
-      generator_.ids.insert(generator_.ids.begin(),
-                            finalPaths_[i][0].ids.begin(),
-                            finalPaths_[i][0].ids.end());
-      starts[i + 1] = starts[i] + finalPaths_[i][0].ids.size();
+      Path& path = finalPaths_[i][0];
+      generator_.ids.insert(
+          generator_.ids.begin(), path.ids.begin(), path.ids.end());
+      starts[i + 1] = starts[i] + path.ids.size();
    }
  }
 }
@@ -1364,25 +1359,76 @@ void RecurrentGradientMachine::copyDataOutlinkFrame(size_t machineCur) {
  }
 }

-void RecurrentGradientMachine::createDataOutlink(
-    std::vector<int>& machineIdVec) {
-  size_t seqNum =
-      getBeamSize() > 1UL ? finalPaths_.size() : finalPaths_[0].size();
-  std::vector<int> starts(seqNum + 1, 0);
-  for (size_t i = 0; i < seqNum; ++i) {
-    size_t seqLen = getBeamSize() > 1UL ? finalPaths_[i][0].ids.size()
-                                        : finalPaths_[0][i].ids.size();
-    starts[i + 1] = starts[i] + seqLen;
+void RecurrentGradientMachine::createDataOutlinkSelRowsInfo(
+    bool isSeq, std::vector<Argument>& outArgs) {
+  batchMachineIdVec_.clear();
+
+  size_t seqIdx = 0;
+  for (size_t i = 0; i < finalPaths_.size(); ++i) {
+    for (size_t j = 0; j < finalPaths_[i].size(); ++j) {
+      std::vector<int>& machineIdVec = finalPaths_[i][j].machineIdVec;
+      if (isSeq) {
+        for (size_t i = 0; i < machineIdVec.size(); ++i) {
+          size_t rowId = machineIdVec[i];
+          int* seqPos =
+              outArgs[i].sequenceStartPositions->getMutableData(false);
+          batchMachineIdVec_.push_back(seqPos[rowId]);
+        }
+      } else {
+        batchMachineIdVec_.insert(
+            batchMachineIdVec_.end(), machineIdVec.begin(), machineIdVec.end());
+      }
+      seqIdx++;
+    }
+  }
+}
+
+void RecurrentGradientMachine::createDataOutlinkCopySizeInfo(
+    bool isSeq, std::vector<Argument>& outArgs, std::vector<int>& copySize) {
+  size_t totalSeqNum = std::accumulate(
+      finalPaths_.begin(),
+      finalPaths_.end(),
+      0UL,
+      [](size_t a, const std::vector<Path>& b) { return a + b.size(); });
+  copySize.resize(totalSeqNum, 1);
+
+  batchMachineStartPos_.resize(totalSeqNum + 1, 0);
+  if (isSeq) {
+    ICpuGpuVectorPtr inputSeqStartPos = outArgs[0].sequenceStartPositions;
+    CHECK_EQ(static_cast<size_t>(inputSeqStartPos->getSize() - 1),
+             getBeamSize() > 1 ? finalPaths_.size() : finalPaths_[0].size());
+    int* starts = inputSeqStartPos->getMutableData(false);
+    int seqId = 0;
+    for (int i = 0; i < finalPaths_.size(); ++i) {
+      for (int j = 0; j < finalPaths_[i].size(); ++j) {
+        copySize[seqId] = getBeamSize() > 1 ? starts[i + 1] - starts[i]
+                                            : starts[j + 1] - starts[j];
+        batchMachineStartPos_[seqId + 1] =
+            batchMachineStartPos_[seqId] + finalPaths_[i][j].ids.size();
+        seqId++;
+      }
+    }
+  } else {
+    for (size_t i = 0; i < finalPaths_[0].size(); ++i)
+      batchMachineStartPos_[i + 1] =
+          batchMachineStartPos_[i] + finalPaths_[0][i].ids.size();
  }
+}

+void RecurrentGradientMachine::createDataOutlink() {
  for (size_t i = 0; i < dataArgsSize_; i++) {
+    bool isSeq = dataArgsFrame_[i][0].hasSeq();
+    std::vector<int> copySize;
+    createDataOutlinkCopySizeInfo(isSeq, dataArgsFrame_[i], copySize);
+    createDataOutlinkSelRowsInfo(isSeq, dataArgsFrame_[i]);
+
    dataArgs_[i].concat(dataArgsFrame_[i],
-                        machineIdVec,
-                        starts,
+                        batchMachineIdVec_,
+                        batchMachineStartPos_,
+                        copySize,
                        useGpu_,
                        HPPL_STREAM_1,
                        PASS_TEST);
-
    auto dataAgent =
        dynamic_cast<DataLayer*>(outFrameLines_[i + 1].agentLayer.get());
    CHECK_NOTNULL(dataAgent);

--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
+++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
@@ -190,7 +190,7 @@ public:
    std::vector<int> ids;

    /**
-     * @brief idsProb, log probability of each generated words.
+     * @brief idsProb, log probability of each generated word.
     */
    std::vector<real> idsProb;

@@ -472,15 +472,43 @@ private:
  void copyDataOutlinkFrame(size_t machineCur);

  /*
-   * @brief In generation, if the layer group has more than 1 outlink, outlinks
-   * except the first one are data outlinks. This function creates the data
-   * outlinks.
-   * @note In beam search, only one generated sequence with the hightest log
-   * probabilites are retained.
-   * @param machineIdVec : select a row of output matrix in each frame
-   * that the generation process expanded.
+   * @brief In generation, if the layer group has more than 1 outlink, outlink
+   * except the first one is a data outlink. In RecurrentLayerGroup, each time
+   * step is a separate Network, outputs of a layer inside the
+   * RecurrentLayerGroup are stored in separate Arguments. If one layer is
+   * specified as an outlink of RecurrentLayerGroup. This function will
+   * collect outputs in each time step of each generated sequence which are
+   * dispersed in separate Arguments to form a new single Argument as output of
+   * RecurrentLayerGroup.
   */
-  void createDataOutlink(std::vector<int>& machineIdVec);
+  void createDataOutlink();
+
+  /*
+   * @brief decide to select how many rows from the Matrix stored the forward
+   * pass results from a start position.
+   *
+   * @param isSeq: a flag indicating whetehr the layer to be output of the
+   * RecurrentGradientMachine is a sequence or not
+   * @param outArgs: all of the the returned Arguments of the forward pass
+   * during the generation process.
+   * @param copySize: the returned result, number of rows to select from the
+   * Matrix stored the forward pass results from a start position.
+   */
+  void createDataOutlinkCopySizeInfo(bool isSeq,
+                                     std::vector<Argument>& outArgs,
+                                     std::vector<int>& copySize);
+
+  /*
+   * @brief decide index of the start row for each time step of a generated
+   * sequence in Matrix stored the entire beam search batch's forward pass
+   * results.
+   *
+   * @param isSeq: a flag indicating whether the layer to be output of the
+   * RecurrentGradientMachine is a sequence or not
+   * @param outArgs: all of the returned Arguments of the forward pass
+   * during the generation process.
+   */
+  void createDataOutlinkSelRowsInfo(bool isSeq, std::vector<Argument>& outArgs);

  /*
   * @brief used in beam search, connect previous frame to form recurrent link
@@ -543,6 +571,7 @@ private:
  std::vector<int> topIds_;
  std::vector<int> seqIds_;
  std::vector<int> batchMachineIdVec_;
+  std::vector<int> batchMachineStartPos_;
  std::vector<std::vector<Path>> finalPaths_;
  std::vector<real> minFinalPathLogProb_;
  BeamSearchControlCallbacks* beamSearchCtrlCallbacks_;

--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -44,6 +44,7 @@ endfunction()
 add_subdirectory(math)

 cc_test(gather_test SRCS gather_test.cc DEPS tensor)
+op_library(gather_op SRCS gather_op.cc gather_op.cu)

 cc_test(scatter_test SRCS scatter_test.cc DEPS tensor)


--- a/paddle/operators/gather.h
+++ b/paddle/operators/gather.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <cstring>

 #include "paddle/framework/ddim.h"
+#include "paddle/framework/eigen.h"
 #include "paddle/framework/tensor.h"
 #include "paddle/platform/place.h"

@@ -25,13 +26,13 @@ namespace operators {

 // Implementation of CPU copy
 template <typename T>
-void CPUGather(const T* params, const int* indices, const int slice_size,
+void CPUGather(const T* src, const int* indices, const int slice_size,
               const int index_size, T* output) {
  const size_t slice_bytes = slice_size * sizeof(T);

  for (int i = 0; i < index_size; ++i) {
    int index_ = indices[i];
-    memcpy(output + i * slice_size, params + index_ * slice_size, slice_bytes);
+    memcpy(output + i * slice_size, src + index_ * slice_size, slice_bytes);
  }
 }

@@ -55,7 +56,7 @@ void Gather(const platform::Place& place, const paddle::framework::Tensor* src,
  int index_size = index->dims()[0];

  auto src_dims = src->dims();
-  paddle::framework::DDim output_dims(src_dims);
+  framework::DDim output_dims(src_dims);
  output_dims[0] = index_size;

  // slice size

--- a/paddle/operators/gather_op.cc
+++ b/paddle/operators/gather_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/gather_op.h"
+#include "paddle/framework/ddim.h"
+
+namespace paddle {
+namespace operators {
+
+class GatherOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    int batch_size = ctx.Input<Tensor>("Index")->dims()[0];
+    PADDLE_ENFORCE_GE(batch_size, 0, "Batch size must be >0");
+    framework::DDim output_dims(ctx.Input<Tensor>("X")->dims());
+    output_dims[0] = batch_size;
+    ctx.Output<Tensor>("Out")->Resize(output_dims);
+  }
+};
+
+class GatherGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    auto X_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto X = ctx.Input<Tensor>("X");
+
+    X_grad->Resize(X->dims());
+  }
+};
+
+class GatherOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  GatherOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The source input of gather op");
+    AddInput("Index", "The index input of gather op");
+    AddOutput("Out", "The output of add op");
+    AddComment(R"DOC(
+Gather Operator by selecting from the first axis, 
+
+Out = X[Index]
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(gather, ops::GatherOp, ops::GatherOpMaker, gather_grad,
+            ops::GatherGradOp);
+REGISTER_OP_CPU_KERNEL(gather,
+                       ops::GatherOpKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    gather_grad,
+    ops::GatherGradientOpKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/gather_op.cu
+++ b/paddle/operators/gather_op.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/gather_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(gather,
+                       ops::GatherOpKernel<paddle::platform::GPUPlace, float>);
--- a/paddle/operators/gather_op.h
+++ b/paddle/operators/gather_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "gather.h"
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "scatter.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename Place, typename T>
+class GatherOpKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto *X = ctx.Input<Tensor>("X");
+    auto *Index = ctx.Input<Tensor>("Index");
+    auto *Y = ctx.Output<Tensor>("Out");
+
+    Y->mutable_data<T>(ctx.GetPlace());
+    Gather<T>(ctx.GetPlace(), X, Index, Y);
+  }
+};
+
+template <typename Place, typename T>
+class GatherGradientOpKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto *Index = ctx.Input<Tensor>("Index");
+    auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto *dO = ctx.Input<Tensor>(framework::GradVarName("Out"));
+
+    dX->mutable_data<T>(ctx.GetPlace());
+    ScatterUpdate<T>(ctx.GetPlace(), dO, Index, dX);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/parameter/Argument.cpp
+++ b/paddle/parameter/Argument.cpp
@@ -276,17 +276,21 @@ int32_t Argument::resizeAndCopyFrom(const Argument& src,
 void Argument::concat(const std::vector<Argument>& args,
                      const std::vector<int>& selectRows,
                      const std::vector<int>& seqStartPos,
+                      const std::vector<int>& copySize,
                      bool useGpu,
                      hl_stream_t stream,
                      PassType passType) {
  CHECK(!subSequenceStartPositions)
      << "undefined behavior for subsequence positions";

-  size_t batchSize = selectRows.size();
+  size_t batchSize = 0;
+  for (size_t i = 0; i < copySize.size(); ++i)
+    batchSize += copySize[i] * (seqStartPos[i + 1] - seqStartPos[i]);
+
  auto copyArg = [batchSize, stream](MatrixPtr& dst,
                                     MatrixPtr src,
-                                     int startRow,
-                                     int pos,
+                                     int desStartRow,
+                                     int srcStartRow,
                                     int size,
                                     bool useGpu) {
    if (!src) {
@@ -300,14 +304,14 @@ void Argument::concat(const std::vector<Argument>& args,
      dst->resize(batchSize, width);
    }

-    MatrixPtr tmpMatrix = dst->subMatrix(startRow, size);
-    tmpMatrix->copyFrom(*src->subMatrix(pos, size), stream);
+    MatrixPtr tmpMatrix = dst->subMatrix(desStartRow, size);
+    tmpMatrix->copyFrom(*src->subMatrix(srcStartRow, size), stream);
  };

  auto copyIds = [batchSize, stream](IVectorPtr& dst,
                                     const IVectorPtr& src,
-                                     int startRow,
-                                     int pos,
+                                     int desStartRow,
+                                     int srcStartRow,
                                     int size,
                                     bool useGpu) {
    if (!src) {
@@ -315,13 +319,14 @@ void Argument::concat(const std::vector<Argument>& args,
      return;
    }
    IVector::resizeOrCreate(dst, batchSize, useGpu);
-    dst->subVec(startRow, size)->copyFrom(*src->subVec(pos, size), stream);
+    dst->subVec(desStartRow, size)
+        ->copyFrom(*src->subVec(srcStartRow, size), stream);
  };

  auto copyStrs = [batchSize, stream](SVectorPtr& dst,
                                      const SVectorPtr& src,
-                                      int startRow,
-                                      int pos,
+                                      int desStartRow,
+                                      int srcStartRow,
                                      int size,
                                      bool useGpu) {
    if (!src) {
@@ -333,30 +338,31 @@ void Argument::concat(const std::vector<Argument>& args,
    } else {
      dst->resize(batchSize);
    }
-    std::copy(
-        src->begin() + pos, src->begin() + pos + size, dst->begin() + startRow);
+    std::copy(src->begin() + srcStartRow,
+              src->begin() + srcStartRow + size,
+              dst->begin() + desStartRow);
  };

  dataId = args[0].dataId;
  CHECK_NE(seqStartPos.size(), 0UL);
-  size_t sampleNum = seqStartPos.size() - 1;
-  for (size_t i = 0; i < sampleNum; ++i) {
+  int desStartRow = 0;
+  for (size_t i = 0; i < copySize.size(); ++i) {
    int startPos = seqStartPos[i];
    int endPos = seqStartPos[i + 1];
    CHECK_GE(args.size(), static_cast<size_t>(endPos - startPos));
    for (int j = startPos; j < endPos; ++j) {
      const Argument& arg = args[j - startPos];
-      CHECK_EQ(arg.dataId, dataId) << "Arguments in concat should have"
-                                   << " same dataId";
-      const int copySize = 1;
-      const int rowIdx = selectRows[j];
-      copyArg(in, arg.in, j, rowIdx, copySize, useGpu);
-      copyArg(value, arg.value, j, rowIdx, copySize, useGpu);
+      CHECK_EQ(arg.dataId, dataId) << "Arguments to concatenate should have "
+                                   << "the same dataId.";
+      const int srcStartRow = selectRows[j];
+      copyArg(in, arg.in, desStartRow, srcStartRow, copySize[i], useGpu);
+      copyArg(value, arg.value, desStartRow, srcStartRow, copySize[i], useGpu);
      if (passType != PASS_TEST) {
-        copyArg(grad, arg.grad, j, rowIdx, copySize, useGpu);
+        copyArg(grad, arg.grad, desStartRow, srcStartRow, copySize[i], useGpu);
      }
-      copyIds(ids, arg.ids, j, rowIdx, copySize, useGpu);
-      copyStrs(strs, arg.strs, j, rowIdx, copySize, useGpu);
+      copyIds(ids, arg.ids, desStartRow, srcStartRow, copySize[i], useGpu);
+      copyStrs(strs, arg.strs, desStartRow, srcStartRow, copySize[i], useGpu);
+      desStartRow += copySize[i];
    }
  }
  ICpuGpuVector::resizeOrCreate(

--- a/paddle/parameter/Argument.h
+++ b/paddle/parameter/Argument.h
@@ -240,6 +240,7 @@ struct Argument {
  void concat(const std::vector<Argument>& args,
              const std::vector<int>& selectRows,
              const std::vector<int>& seqStartPos,
+              const std::vector<int>& copySize,
              bool useGpu,
              hl_stream_t stream,
              PassType passType);

--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -338,7 +338,8 @@ def RecurrentLayerGroupWithoutOutLinksBegin(name,
        in_links_count += 1
        layer_name = MakeLayerNameInParentSubmodel(name)
        layer = g_layer_map[layer_name]
-        ScatterAgentLayer(name=name, size=layer.size)
+        ScatterAgentLayer(
+            name=name, size=layer.size, width=layer.width, height=layer.height)

        pair = g_current_submodel.in_links.add()
        pair.layer_name = layer_name
@@ -2197,8 +2198,8 @@ class MaxOutLayer(LayerBase):
        maxout_conf = self.config.inputs[0].maxout_conf
        parse_maxout(self.inputs[0].maxout, input_layer.name, maxout_conf)
        out_channels = maxout_conf.image_conf.channels / maxout_conf.groups
-        self.set_cnn_layer(name, g_layer_map[input_layer.name].height,
-                           g_layer_map[input_layer.name].width, out_channels)
+        self.set_cnn_layer(name, maxout_conf.image_conf.img_size_y,
+                           maxout_conf.image_conf.img_size, out_channels)


 @config_layer('row_conv')
@@ -2405,9 +2406,11 @@ class GatherAgentLayer(LayerBase):

 @config_layer('scatter_agent')
 class ScatterAgentLayer(LayerBase):
-    def __init__(self, name, size, device=None):
+    def __init__(self, name, size, width=None, height=None, device=None):
        super(ScatterAgentLayer, self).__init__(
            name, 'scatter_agent', size, inputs=[], device=device)
+        if height and width:
+            self.set_layer_height_width(height, width)


 @config_layer('multiplex')

--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -16,11 +16,13 @@ import functools
 import collections
 import inspect

+import paddle.trainer.config_parser as cp
 from paddle.trainer.config_parser import *
 from .activations import LinearActivation, SigmoidActivation, TanhActivation, \
    ReluActivation, IdentityActivation, SoftmaxActivation, BaseActivation
 from .evaluators import *
-from .poolings import MaxPooling, AvgPooling, BasePoolingType
+from .poolings import MaxPooling, AvgPooling, BasePoolingType, \
+    CudnnAvgPooling, CudnnMaxPooling
 from .attrs import *
 from .default_decorators import *

@@ -330,6 +332,14 @@ class LayerOutput(object):
        self.outputs = outputs
        self.reverse = reverse

+    @property
+    def width(self):
+        return cp.g_layer_map[self.full_name].width
+
+    @property
+    def height(self):
+        return cp.g_layer_map[self.full_name].height
+
    def set_input(self, input):
        """
        Set the input for a memory layer. Can only be used for memory layer
@@ -911,7 +921,13 @@ def data_layer(name, size, height=None, width=None, layer_attr=None):
        width=width,
        **ExtraLayerAttribute.to_kwargs(layer_attr))

-    return LayerOutput(name, LayerType.DATA, size=size)
+    num_filters = None
+    if height is not None and width is not None:
+        num_filters = size / (width * height)
+        assert num_filters * width * height == size, \
+            "size=%s width=%s height=%s" % (size, width, height)
+
+    return LayerOutput(name, LayerType.DATA, size=size, num_filters=num_filters)


 @wrap_name_default("embedding")
@@ -2571,6 +2587,10 @@ def img_pool_layer(input,
        assert input.num_filters is not None
        num_channels = input.num_filters

+    assert type(pool_type) in [AvgPooling, MaxPooling, CudnnAvgPooling,
+                               CudnnMaxPooling], \
+        "only (Cudnn)AvgPooling, (Cudnn)MaxPooling are supported"
+
    if pool_type is None:
        pool_type = MaxPooling()
    elif isinstance(pool_type, AvgPooling):
@@ -2580,7 +2600,6 @@ def img_pool_layer(input,
        if (
        isinstance(pool_type, AvgPooling) or isinstance(pool_type, MaxPooling)) \
        else pool_type.name
-
    pool_size_y = pool_size if pool_size_y is None else pool_size_y
    stride_y = stride if stride_y is None else stride_y
    padding_y = padding if padding_y is None else padding_y
@@ -4204,8 +4223,7 @@ def conv_operator(img,
        num_channels = img.num_filters

    assert isinstance(filter, LayerOutput)
-    if filter.size is not None:
-        filter.size = filter_size * filter_size_y * num_filters * num_channels
+    assert filter.size is not None

    opCls = ConvTransOperator if trans else ConvOperator

@@ -4916,7 +4934,6 @@ def maxout_layer(input, groups, num_channels=None, name=None, layer_attr=None):
    :return: LayerOutput object.
    :rtype: LayerOutput
    """
-    assert input.layer_type == LayerType.CONV_LAYER
    assert isinstance(input.activation, LinearActivation)
    assert groups > 1
    if num_channels is None:

--- a/python/paddle/v2/framework/tests/CMakeLists.txt
+++ b/python/paddle/v2/framework/tests/CMakeLists.txt
@@ -13,6 +13,7 @@ py_test(test_add_two_op SRCS test_add_two_op.py)
 py_test(test_sigmoid_op SRCS test_sigmoid_op.py)
 py_test(test_softmax_op SRCS test_softmax_op.py)
 py_test(test_cross_entropy_op SRCS test_cross_entropy_op.py)
+py_test(test_gather_op SRCS test_gather_op.py)
 py_test(test_fill_zeros_like_op SRCS test_fill_zeros_like_op.py)

 py_test(gradient_checker SRCS gradient_checker.py)

--- a/python/paddle/v2/framework/tests/test_gather_op.py
+++ b/python/paddle/v2/framework/tests/test_gather_op.py
+import unittest
+from op_test_util import OpTestMeta
+from gradient_checker import GradientChecker, create_op
+import numpy
+import paddle.v2.framework.core as core
+from paddle.v2.framework.op import Operator
+
+
+class TestGatherOp(unittest.TestCase):
+    __metaclass__ = OpTestMeta
+
+    def setUp(self):
+        self.type = "gather"
+        xnp = numpy.random.random((10, 20)).astype("float32")
+        self.inputs = {
+            'X': xnp,
+            'Index': numpy.array([1, 3, 5]).astype("int32")
+        }
+        self.outputs = {'Out': self.inputs['X'][self.inputs['Index']]}
+
+
+class TestGatherGradOp(GradientChecker):
+    def test_gather_grad(self):
+        print 'creating op'
+        op = create_op("gather")
+        print 'creating op done'
+        xnp = numpy.random.random((10, 20)).astype("float32")
+        inputs = {'X': xnp, 'Index': numpy.array([1, 3, 5]).astype("int32")}
+        print 'correct before check gradient'
+        self.check_grad(op, inputs, set("X"), "Out")
+
+
+if __name__ == "__main__":
+    unittest.main()