add base class for seqlastin/max/average layer (#187)

e1f57bfd · luotao1 · emailweixu · 76fb74dc · e1f57bfd · e1f57bfd
7 changed files
--- a/paddle/gserver/layers/AverageLayer.cpp
+++ b/paddle/gserver/layers/AverageLayer.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "AverageLayer.h"
 #include "paddle/utils/Logging.h"
@@ -25,13 +24,8 @@ REGISTER_LAYER(average, AverageLayer);
 bool AverageLayer::init(const LayerMap& layerMap,
                        const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
+  SequencePoolLayer::init(layerMap, parameterMap);
-  Layer::init(layerMap, parameterMap);
-  /* initialize biases_ */
-  if (biasParameter_.get() != NULL) {
-    biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
-  }
  dataMtx_ = Matrix::create(nullptr, 1, 1, false, useGpu_);
  outMtx_ = Matrix::create(nullptr, 1, getSize(), false, useGpu_);
  // average strategy
@@ -44,57 +38,15 @@ bool AverageLayer::init(const LayerMap& layerMap,
  } else {
    LOG(FATAL) << "Unknown average strategy: " << config_.average_strategy();
  }
-  // transform to which sequence type
-  if (config_.trans_type() == "non-seq") {
-    type_ = kNonSeq;
-  } else if (config_.trans_type() == "seq") {
-    type_ = kSeq;
-  } else {
-    LOG(FATAL) << "Unknown trans_type: " << config_.trans_type();
-  }
-  setNeedSequenceInfo(false);
  return true;
 }
 void AverageLayer::forward(PassType passType) {
-  Layer::forward(passType);
+  SequencePoolLayer::forward(passType);
-  // average layer should have exactly 1 input
-  CHECK_EQ(1U, inputLayers_.size());
-  size_t dim = getSize();
-  const Argument& input = getInput(0);
-  CHECK(input.sequenceStartPositions);
-  if (type_) {
-    CHECK(input.subSequenceStartPositions)
-      << "when trans_type = seq, input must hasSubseq";
-  }
-  int64_t newBatchSize =
-      type_ ? input.getNumSubSequences() : input.getNumSequences();
-  ICpuGpuVectorPtr startPositions =
-      type_ ? input.subSequenceStartPositions
-            : input.sequenceStartPositions;
-  const int* starts = startPositions->getData(false);
-  size_t numSequences = startPositions->getSize() - 1;
-  // check
-  CHECK_EQ(numSequences, (size_t)newBatchSize);
-  CHECK_EQ(starts[numSequences], input.getBatchSize());
-  CHECK_EQ(dim, input.value->getWidth());
-  resetOutput(newBatchSize, dim);
-  auto startsPos = startPositions->getVector(useGpu_);
  MatrixPtr inputValue = getInputValue(0);
-  getOutputValue()->sequenceAvgForward(*inputValue, *startsPos, mode_);
+  getOutputValue()->sequenceAvgForward(
+      *inputValue, *startPositions_->getVector(useGpu_), mode_);
-  /* If type_ = kNonSeq, both seq has or not has sub-seq degrade to a non-seq,
-   * thus, in this case, output_ has no sequenceStartPositions.
-   * If type_ = kSeq, seq has sub-seq degrades to a seq, thus, only in this
-   * case, we should compute the new sequenceStartPositions.
-  */
-  if (type_) {
-    output_.degradeSequence(input, useGpu_);
-  }
  /* add the bias-vector AFTER average operation */
  if (biases_.get() != NULL) {
@@ -106,26 +58,16 @@ void AverageLayer::forward(PassType passType) {
 }
 void AverageLayer::backward(const UpdateCallback& callback) {
-  const Argument& input = getInput(0);
+  SequencePoolLayer::backward(callback);
-  ICpuGpuVectorPtr startPositions =
-      type_ ? input.subSequenceStartPositions
-            : input.sequenceStartPositions;
-  const int* starts = startPositions->getData(false);
-  /* Do derivation */ { backwardActivation(); }
-  if (biases_ && biases_->getWGrad()) {
-    biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
-    // Increasing the number of gradient
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
+  const int* starts = startPositions_->getData(false);
  MatrixPtr grad = getInputGrad(0);
  if (grad) {
    size_t dim = getSize();
    real* gradientData = getInputGrad(0)->getData();
    real* gradient = getOutputGrad()->getData();
-    size_t numSequences = startPositions->getSize() - 1;
+    size_t numSequences = startPositions_->getSize() - 1;
    for (size_t sequenceId = 0; sequenceId < numSequences; ++sequenceId) {
      // TODO(Dangqingqing) optimization for GPU
      int sequenceLength = starts[sequenceId + 1] - starts[sequenceId];

--- a/paddle/gserver/layers/AverageLayer.h
+++ b/paddle/gserver/layers/AverageLayer.h
@@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
-#include "Layer.h"
+#include "SequencePoolLayer.h"
 #include "paddle/math/Matrix.h"
 namespace paddle {
@@ -23,20 +22,21 @@ namespace paddle {
 /**
 * A layer for "internal average" for sequence input.
 * Input: one or more sequences. Each sequence contains some instances.
- * If AverageLevel = kNonSeq:
+ * If SequenceLevel = kNonSeq:
 *    Output: output size is the number of input sequences (NOT input instances)
 *    output[i] = average_{for each instance in this sequence}{input[i]}
- * If AverageLevel = kSeq:
+ * If SequenceLevel = kSeq:
 *    Check input sequence must has sub-sequence
 *    Output: output size is the number of input sub-sequences
 *    output[i] = average_{for each instance in this sub-sequence}{input[i]}
+ *
+ * The config file api is pooling_layer.
 */
+class AverageLayer : public SequencePoolLayer {
-class AverageLayer : public Layer {
 public:
  enum AverageStrategy { kAverage = 0, kSum = 1, kAverageSquareRootN = 2 };
-  enum AverageLevel { kNonSeq = 0, kSeq = 1 };
+  explicit AverageLayer(const LayerConfig& config)
-  explicit AverageLayer(const LayerConfig& config) : Layer(config) {}
+      : SequencePoolLayer(config) {}
  ~AverageLayer() {}
@@ -46,11 +46,8 @@ public:
  void backward(const UpdateCallback& callback = nullptr);
 protected:
-  std::unique_ptr<Weight> biases_;
  MatrixPtr outMtx_;
  MatrixPtr dataMtx_;
  int mode_;
-  int type_;
 };
 }  // namespace paddle
--- a/paddle/gserver/layers/MaxLayer.cpp
+++ b/paddle/gserver/layers/MaxLayer.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "MaxLayer.h"
 #include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
@@ -21,55 +20,11 @@ namespace paddle {
 REGISTER_LAYER(max, MaxLayer);
-bool MaxLayer::init(const LayerMap& layerMap,
-                    const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-  /* initialize biases_ */
-  if (biasParameter_.get() != NULL) {
-    biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
-  }
-  // transform to which sequence type
-  if (config_.trans_type() == "non-seq") {
-    type_ = kNonSeq;
-  } else if (config_.trans_type() == "seq") {
-    type_ = kSeq;
-  } else {
-    LOG(FATAL) << "Unknown trans_type: " << config_.trans_type();
-  }
-  setNeedSequenceInfo(false);
-  return true;
-}
 void MaxLayer::forward(PassType passType) {
-  Layer::forward(passType);
+  SequencePoolLayer::forward(passType);
-  // max layer should have exactly 1 input
-  CHECK_EQ(1U, inputLayers_.size());
-  size_t dim = getSize();
-  const Argument& input = getInput(0);
-  int64_t newBatchSize =
-      type_ ? input.getNumSubSequences() : input.getNumSequences();
-  ICpuGpuVectorPtr startPositions =
-      type_ ? input.subSequenceStartPositions
-            : input.sequenceStartPositions;
-  auto starts = startPositions->getVector(useGpu_);
-  size_t numSequences = startPositions->getSize() - 1;
-  CHECK_EQ(dim, input.value->getWidth());
+  IVector::resizeOrCreate(maxIndex_, newBatchSize_ * getSize(),
-  CHECK_EQ(numSequences, (size_t)newBatchSize);
+                          useGpu(deviceId_));
-  CHECK_EQ(startPositions->getData(false)[numSequences], input.getBatchSize());
-  if (type_) {
-    // when trans_type = seq, input must hasSubseq
-    CHECK_EQ(input.hasSubseq(), 1UL);
-  }
-  // reset output: resize to "num of sequences", not "batch size".
-  resetOutput(newBatchSize, dim);
-  IVector::resizeOrCreate(maxIndex_, newBatchSize * dim, useGpu(deviceId_));
  maxIndex_->zeroMem();
  MatrixPtr inputValue = getInputValue(0);
@@ -77,16 +32,8 @@ void MaxLayer::forward(PassType passType) {
  {
    REGISTER_TIMER_INFO("MaxLayerForward", getName().c_str());
-    outputValue->maxSequenceForward(*inputValue, *starts, *maxIndex_);
+    outputValue->maxSequenceForward(
-  }
+        *inputValue, *startPositions_->getVector(useGpu_), *maxIndex_);
-  /* If type_ = kNonSeq, both seq has or not has sub-seq degrade to a non-seq,
-   * thus, in this case, output_ has no cpuSequenceStartPositions.
-   * If type_ = kSeq, seq has sub-seq degrades to a seq, thus, only in this
-   * case, we should compute the new cpuSequenceStartPositions.
-  */
-  if (type_) {
-    output_.degradeSequence(input, useGpu_);
  }
  if (config_.output_max_index()) {
@@ -104,24 +51,14 @@ void MaxLayer::forward(PassType passType) {
 void MaxLayer::backward(const UpdateCallback& callback) {
  CHECK(!config_.output_max_index())
      << "backward is not available when output_max_index is set";
-  /* Do derivation */ { backwardActivation(); }
+  SequencePoolLayer::backward(callback);
-  if (biases_ && biases_->getWGrad()) {
-    biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
-    // Increasing the number of gradient
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
  MatrixPtr inputGrad = getInputGrad(0);
  MatrixPtr outputGrad = getOutputGrad();
  if (inputGrad) {
-    ICpuGpuVectorPtr starts =
-        type_ ? getInput(0).subSequenceStartPositions
-              : getInput(0).sequenceStartPositions;
    REGISTER_TIMER_INFO("MaxLayerBackward", getName().c_str());
-    inputGrad->maxSequenceBackward(*outputGrad,
+    inputGrad->maxSequenceBackward(
-        *(starts->getVector(useGpu_)), *maxIndex_);
+        *outputGrad, *(startPositions_->getVector(useGpu_)), *maxIndex_);
  }
 }

--- a/paddle/gserver/layers/MaxLayer.h
+++ b/paddle/gserver/layers/MaxLayer.h
@@ -15,7 +15,7 @@ limitations under the License. */
 #pragma once
-#include "Layer.h"
+#include "SequencePoolLayer.h"
 #include "paddle/math/Matrix.h"
 #include "paddle/utils/ThreadLocal.h"
@@ -24,29 +24,30 @@ namespace paddle {
 /**
 * A layer for "internal max" for sequence input.
 * Input: one or more sequences. Each sequence contains some instances.
- * If MaxLevel = kNonSeq:
+ * If SequenceLevel = kNonSeq:
 *    Output: output size is the number of input sequences (NOT input instances)
 *    output[i] = max_{for each instance in this sequence}{input[i]}
- * If MaxLevel = kSeq:
+ * If SequenceLevel = kSeq:
 *    Check input sequence must has sub-sequence
 *    Output: output size is the number of input sub-sequences
 *    output[i] = max_{for each instance in this sub-sequence}{input[i]}
+ *
+ * The config file api is pooling_layer.
 */
-class MaxLayer : public Layer {
+class MaxLayer : public SequencePoolLayer {
 protected:
-  std::unique_ptr<Weight> biases_;
  // maxIndex_[i][j] = k : the value at (i, j) is from input[k].
  IVectorPtr maxIndex_;
-  int type_;
 public:
-  explicit MaxLayer(const LayerConfig& config) : Layer(config) {}
+  explicit MaxLayer(const LayerConfig& config) : SequencePoolLayer(config) {}
-  enum MaxLevel {kNonSeq = 0, kSeq = 1 };
  ~MaxLayer() {}
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap) {
+    return SequencePoolLayer::init(layerMap, parameterMap);
+  }
  void forward(PassType passType);
  void backward(const UpdateCallback& callback = nullptr);

--- a/paddle/gserver/layers/SequenceLastInstanceLayer.cpp
+++ b/paddle/gserver/layers/SequenceLastInstanceLayer.cpp
@@ -15,7 +15,7 @@ limitations under the License. */
 #include "paddle/utils/Logging.h"
-#include "Layer.h"
+#include "SequencePoolLayer.h"
 #include "paddle/math/Matrix.h"
 #include "paddle/utils/Stat.h"
@@ -30,19 +30,18 @@ namespace paddle {
 *   Check input sequence must has sub-sequence
 *   Output: a sequence containing only the last instance of each sub-sequence
 *           of the input sequence
+ *
+ * The config file api is last_seq and first_seq.
 */
-class SequenceLastInstanceLayer : public Layer {
+class SequenceLastInstanceLayer : public SequencePoolLayer {
 protected:
-  std::unique_ptr<Weight> biases_;
  MatrixPtr tmpSrc_;
  MatrixPtr tmpDest_;
-  enum SequenceLevel { kNonSeq = 0, kSeq = 1 };
-  int type_;
 public:
  explicit SequenceLastInstanceLayer(const LayerConfig& config)
-      : Layer(config) {}
+      : SequencePoolLayer(config) {}
  ~SequenceLastInstanceLayer() {}
@@ -56,56 +55,20 @@ REGISTER_LAYER(seqlastins, SequenceLastInstanceLayer);
 bool SequenceLastInstanceLayer::init(const LayerMap& layerMap,
                                     const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
+  SequencePoolLayer::init(layerMap, parameterMap);
-  Layer::init(layerMap, parameterMap);
-  // seqlastins layer should have exactly 1 input
-  CHECK_EQ(1U, inputLayers_.size());
-  /* initialize biases_ */
-  if (biasParameter_.get() != NULL) {
-    biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
-  }
  tmpSrc_ =
      Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_);
  tmpDest_ =
      Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_);
-  // transform to which sequence type
-  if (config_.trans_type() == "non-seq") {
-    type_ = kNonSeq;
-  } else if (config_.trans_type() == "seq") {
-    type_ = kSeq;
-  } else {
-    LOG(FATAL) << "Unknown trans_type: " << config_.trans_type();
-  }
-  setNeedSequenceInfo(false);
  return true;
 }
 void SequenceLastInstanceLayer::forward(PassType passType) {
-  Layer::forward(passType);
+  SequencePoolLayer::forward(passType);
-  size_t dim = getSize();
-  const Argument& input = getInput(0);
-  // check
+  const int* starts = startPositions_->getData(false);
-  CHECK(input.sequenceStartPositions);
-  if (type_) {
-    CHECK(input.subSequenceStartPositions)
-      << "when trans_type = seq, input must hasSubseq";
-  }
-  auto startPositions =
-      type_ ? input.subSequenceStartPositions->getVector(false)
-            : input.sequenceStartPositions->getVector(false);
-  size_t height = type_ ? input.getNumSubSequences() : input.getNumSequences();
-  CHECK_EQ(dim, input.value->getWidth());
-  CHECK_EQ(startPositions->getData()[height], input.getBatchSize());
-  CHECK_EQ(height, startPositions->getSize() - 1);
-  reserveOutput(height, dim);
-  const int* starts = startPositions->getData();
  MatrixPtr inputValue = getInputValue(0);
  MatrixPtr outputValue = getOutputValue();
@@ -113,21 +76,13 @@ void SequenceLastInstanceLayer::forward(PassType passType) {
    AsyncGpuBlock asyncGpuBlock;
    REGISTER_TIMER_INFO("SequenceLastInstanceLayerForward", getName().c_str());
-    for (size_t seqId = 0; seqId < height; ++seqId) {
+    for (size_t seqId = 0; seqId < newBatchSize_; ++seqId) {
      int insId =
          config_.select_first() ? starts[seqId] : starts[seqId + 1] - 1;
      outputValue->subMatrix(seqId, 1, tmpDest_)
          ->assign(*(inputValue->subMatrix(insId, 1, tmpSrc_)));
    }
-    /* If type_ = kNonSeq, both seq has or not has sub-seq degrade to a non-seq,
-     * thus, in this case, output_ has no sequenceStartPositions.
-     * If type_ = kSeq, seq has sub-seq degrades to a seq, thus, only in this
-     * case, we should compute the new sequenceStartPositions.
-    */
-    if (type_) {
-      output_.degradeSequence(input, useGpu_);
-    }
  }
  if (biases_.get() != NULL) {
@@ -139,23 +94,12 @@ void SequenceLastInstanceLayer::forward(PassType passType) {
 }
 void SequenceLastInstanceLayer::backward(const UpdateCallback& callback) {
-  /* activation, should set to 'linear' in most cases */
+  SequencePoolLayer::backward(callback);
-  backwardActivation();
-  if (biases_ && biases_->getWGrad()) {
-    biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
-    // Increasing the number of gradient
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
  MatrixPtr inputGrad = getInputGrad(0);
  MatrixPtr outputGrad = getOutputGrad();
-  auto startPositions =
+  const int* starts = startPositions_->getData(false);
-      type_ ? getInput(0).subSequenceStartPositions->getVector(false)
+  size_t numSequences = startPositions_->getSize() - 1;
-            : getInput(0).sequenceStartPositions->getVector(false);
-  const int* starts = startPositions->getData();
-  size_t numSequences = startPositions->getSize() - 1;
  if (inputGrad) {
    AsyncGpuBlock asyncGpuBlock;

--- a/paddle/gserver/layers/SequencePoolLayer.cpp
+++ b/paddle/gserver/layers/SequencePoolLayer.cpp
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/utils/Logging.h"
+#include "SequencePoolLayer.h"
+namespace paddle {
+/**
+ * Shared initialization for the seqlastins/max/average layers: initializes
+ * the base Layer, verifies there is exactly one input, creates the optional
+ * bias weight and resolves config_.trans_type() into type_.
+ *
+ * @param layerMap      forwarded unchanged to Layer::init.
+ * @param parameterMap  forwarded unchanged to Layer::init.
+ * @return false if Layer::init fails, true otherwise. An unknown trans_type
+ *         is reported through LOG(FATAL).
+ */
+bool SequencePoolLayer::init(const LayerMap& layerMap,
+                             const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class; propagate its failure instead of
+   * silently continuing with a half-initialized layer */
+  if (!Layer::init(layerMap, parameterMap)) {
+    return false;
+  }
+  // seqlastins/max/average layer should have exactly 1 input
+  CHECK_EQ(1U, inputLayers_.size());
+  /* initialize biases_ (a 1 x getSize() weight) only when a bias parameter
+   * is present */
+  if (biasParameter_.get() != nullptr) {
+    biases_.reset(new Weight(1, getSize(), biasParameter_));
+  }
+  // transform to which sequence type
+  if (config_.trans_type() == "non-seq") {
+    type_ = kNonSeq;
+  } else if (config_.trans_type() == "seq") {
+    type_ = kSeq;
+  } else {
+    LOG(FATAL) << "Unknown trans_type: " << config_.trans_type();
+  }
+  setNeedSequenceInfo(false);
+  return true;
+}
+/**
+ * Shared forward step for the seqlastins/max/average layers.
+ *
+ * Selects the start positions to pool over (sub-sequence starts when
+ * type_ == kSeq, sequence starts otherwise), stores them in startPositions_,
+ * stores the number of pooled rows in newBatchSize_, validates the input
+ * against them and resets the output to newBatchSize_ x getSize().
+ * Subclasses fill the output value afterwards.
+ *
+ * @param passType  forwarded unchanged to Layer::forward.
+ */
+void SequencePoolLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  const Argument& input = getInput(0);
+  // The selected start positions are dereferenced below, so verify that they
+  // exist first instead of after the fact.
+  if (type_) {
+    CHECK(input.subSequenceStartPositions)
+      << "when trans_type = seq, input must hasSubseq";
+  } else {
+    CHECK(input.sequenceStartPositions);
+  }
+  newBatchSize_ = type_ ? input.getNumSubSequences() : input.getNumSequences();
+  startPositions_ =
+      type_ ? input.subSequenceStartPositions : input.sequenceStartPositions;
+  size_t dim = getSize();
+  // check
+  CHECK_EQ(dim, input.value->getWidth());
+  auto starts = startPositions_->getVector(false);
+  // Validate the vector length before reading element newBatchSize_, so the
+  // read below cannot go out of bounds.
+  CHECK_EQ(newBatchSize_, starts->getSize() - 1);
+  CHECK_EQ(starts->getData()[newBatchSize_], input.getBatchSize());
+  resetOutput(newBatchSize_, dim);
+  /* If type_ = kNonSeq, both seq has or not has sub-seq degrade to a non-seq,
+   * thus, in this case, output_ has no sequenceStartPositions.
+   * If type_ = kSeq, seq has sub-seq degrades to a seq, thus, only in this
+   * case, we should compute the new sequenceStartPositions.
+  */
+  if (type_) {
+    output_.degradeSequence(input, useGpu_);
+  }
+}
+/**
+ * Shared backward step for the seqlastins/max/average layers: runs the
+ * activation backward pass and, when a bias with a gradient buffer exists,
+ * collects the bias gradient from the output gradient and notifies the
+ * parameter through incUpdate. Gradients for the input are left to the
+ * subclasses.
+ *
+ * @param callback  passed on to the bias parameter's incUpdate.
+ */
+void SequencePoolLayer::backward(const UpdateCallback& callback) {
+  /* Do derivation */
+  backwardActivation();
+  // Nothing more to do without a bias or without a bias gradient buffer.
+  if (!biases_) {
+    return;
+  }
+  const auto& biasGrad = biases_->getWGrad();
+  if (!biasGrad) {
+    return;
+  }
+  biasGrad->collectBias(*getOutputGrad(), 1);
+  // Increasing the number of gradient
+  biases_->getParameterPtr()->incUpdate(callback);
+}
+}  // namespace paddle
--- a/paddle/gserver/layers/SequencePoolLayer.h
+++ b/paddle/gserver/layers/SequencePoolLayer.h
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "Layer.h"
+#include "paddle/math/Matrix.h"
+namespace paddle {
+/**
+ * A base layer for SequenceLastInstanceLayer/AverageLayer/MaxLayer.
+ *
+ * Input: one or more sequences. Each sequence contains some instances.
+ * If SequenceLevel = kNonSeq:
+ *    Output: output size is the number of input sequences (NOT input instances)
+ *    output[i] = seqlastins/average/max_{for each instance in this
+ *    sequence}{input[i]}
+ * If SequenceLevel = kSeq:
+ *    The input sequence must have sub-sequences (checked in forward)
+ *    Output: output size is the number of input sub-sequences
+ *    output[i] = seqlastins/average/max_{for each instance in this
+ *    sub-sequence}{input[i]}
+ *
+ * The config file api is pooling_layer (AverageLayer/MaxLayer) and
+ * last_seq/first_seq (SequenceLastInstanceLayer).
+ */
+class SequencePoolLayer : public Layer {
+protected:
+  // Pooling level: kNonSeq pools each sequence, kSeq pools each sub-sequence.
+  enum SequenceLevel { kNonSeq = 0, kSeq = 1 };
+  // SequenceLevel resolved from config_.trans_type() in init().
+  int type_;
+  // Optional bias weight; created in init() only when a bias parameter exists.
+  std::unique_ptr<Weight> biases_;
+  // Number of output rows, set by forward(): the number of sub-sequences when
+  // type_ == kSeq, otherwise the number of sequences.
+  size_t newBatchSize_;
+  // Start positions being pooled over, set by forward(): sub-sequence starts
+  // when type_ == kSeq, otherwise sequence starts.
+  ICpuGpuVectorPtr startPositions_;
+public:
+  // Give the plain members defined values so they are never read
+  // uninitialized before init()/forward() have run.
+  explicit SequencePoolLayer(const LayerConfig& config)
+      : Layer(config), type_(kNonSeq), newBatchSize_(0) {}
+  virtual ~SequencePoolLayer() {}
+  // Initializes the base layer, the optional bias and type_.
+  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  // Validates the input, sets newBatchSize_/startPositions_, resets the output.
+  void forward(PassType passType);
+  // Runs the activation backward pass and accumulates the bias gradient.
+  void backward(const UpdateCallback& callback = nullptr);
+};
+}  // namespace paddle