"Input(Tensor or LoDTensor): The parameter to be accumulated.");
AddInput("param","(Tensor), The parameter to be accumulated.");
AddInput("in_sum_1",
"Input(Tensor or LoDTensor): A tensor used to store the parameter "
"(Tensor), A tensor used to store the parameter "
"sums with the same shape as input(param).");
AddInput("in_sum_2",
"Input(Tensor or LoDTensor): A auxiliary tensor to help "
"(Tensor), A auxiliary tensor to help "
"accumulating sums of parameter values with the same shape as "
"input(param). It is used to avoid loss of precision due to too "
"many sums.");
AddInput("in_sum_3",
"Input(Tensor or LoDTensor): A auxiliary tensor to help "
"(Tensor), A auxiliary tensor to help "
"accumulating sums of parameter values with the same shape as "
"input(param).");
AddInput("in_num_accumulates",
"Input(Tensor): The accumulating times of current window with "
"shape [1].");
AddInput("in_old_num_accumulates",
"Input(Tensor): The accumulating times of previous window with "
"(Tensor<int64_t>), The accumulating times of current window with "
"shape [1].");
AddInput(
"in_old_num_accumulates",
"(Tensor<int64_t>), The accumulating times of previous window with "
"shape [1].");
AddInput("in_num_updates",
"Input(Tensor): The total number of batches used by trainning "
"(Tensor<int64_t>), The total number of batches used by trainning "
"before this batch with shape [1].");
AddOutput("out_sum_1",
"Output(Tensor or LoDTensor): A tensor used to store the "
"(Tensor), A tensor used to store the "
"parameter sums with the same shape as input(param).");
AddOutput("out_sum_2",
"Output(Tensor or LoDTensor): A auxiliary tensor to help "
"(Tensor), A auxiliary tensor to help "
"accumulating sums of parameter values with the same shape as "
"input(param). It is used to avoid loss of precision due to too "
"many sums.");
AddOutput("out_sum_3",
"Output(Tensor or LoDTensor): A auxiliary tensor to help "
"(Tensor), A auxiliary tensor to help "
"accumulating sums of parameter values with the same shape as "
"input(param).");
AddOutput("out_num_accumulates",
"Output(Tensor): The accumulating times of current window with "
"shape [1].");
AddOutput("out_old_num_accumulates",
"Output(Tensor): The accumulating times of previous window with "
"shape [1].");
AddOutput("out_num_updates",
"Output(Tensor): The total number of batches used by trainning "
"before this batch with shape [1].");
AddOutput(
"out_num_accumulates",
"(Tensor<int64_t>), The accumulating times of current window with "
"shape [1].");
AddOutput(
"out_old_num_accumulates",
"(Tensor<int64_t>) The accumulating times of previous window with "
"shape [1].");
AddOutput(
"out_num_updates",
"(Tensor<int64_t>), The total number of batches used by trainning "
"before this batch with shape [1].");
AddAttr<float>("average_window",
"The rate of average window size relative to num_updates.");
AddAttr<int64_t>("max_average_window","Maximum size of average window.");
AddAttr<int64_t>("min_average_window","Minimu size of average window.");
"(float, default 0) "
"The rate of average window size relative to num_updates.")
.SetDefault(0);
AddAttr<int64_t>("max_average_window",
"(int64_t) "
"Maximum size of average window. It suggests that the "
"number of mini-batches "
"in one pass is appropriate value to set.");
AddAttr<int64_t>("min_average_window",
"(int64_t, default 10000L) "
"Minimu size of average window.")
.SetDefault(10000L);
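// A minimal sketch of how the three window attributes above are assumed to
// interact (illustrative condition, not the kernel's actual code): the
// current window is treated as full, and accumulation restarts, once
//   num_accumulates >= min_average_window &&
//   num_accumulates >= std::min<int64_t>(max_average_window,
//                                        num_updates * average_window)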
AddComment(R"DOC(
AverageAccumulates Operator.
Accumulate the sum of the parameter within a sliding window. The size of the sliding window is
determined by 'average_window', 'max_average_window' and 'min_average_window'.
Memory is shared by Input(in_sum_1) and Output(out_sum_1), which acts as an accumulator 'sum_1'.
'sum_2', 'sum_3', 'num_accumulates', 'old_num_accumulates' and 'num_updates' are shared in the same way as 'sum_1'.
All the accumulators are initialized to zero before training.
For each mini-batch in training, the accumulators are updated in the following steps: